Bumping up min version for pyarrow #23482

Merged 5 commits on Nov 5, 2018
2 changes: 1 addition & 1 deletion ci/requirements-optional-conda.txt
@@ -12,7 +12,7 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
 pytables>=3.4.2
 pytest-cov
2 changes: 1 addition & 1 deletion ci/travis-27.yaml
@@ -22,7 +22,7 @@ dependencies:
   - patsy
   - psycopg2
   - py
-  - pyarrow=0.4.1
+  - pyarrow=0.7.0
   - PyCrypto
   - pymysql=0.6.3
   - pytables
4 changes: 2 additions & 2 deletions doc/source/install.rst
@@ -258,8 +258,8 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
 * `xarray <http://xarray.pydata.org>`__: pandas-like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
-* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
+* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0): necessary for feather-based storage.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database-specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
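
For orientation, the feather and parquet round-trips these optional dependencies enable look roughly like this (a minimal sketch against the public pandas API; file names are hypothetical):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]})

    # feather-based storage goes through pyarrow (>= 0.7.0 after this change)
    df.to_feather('example.feather')
    roundtripped = pd.read_feather('example.feather')

    # parquet-based storage accepts either engine; 'pyarrow' shown here
    df.to_parquet('example.parquet', engine='pyarrow')
    roundtripped = pd.read_parquet('example.parquet', engine='pyarrow')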
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
@@ -250,7 +250,7 @@ Backwards incompatible API changes
 Dependencies have increased minimum versions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We have updated our minimum supported versions of dependencies (:issue:`21242`).
+We have updated our minimum supported versions of dependencies (:issue:`21242`, :issue:`18742`).
 If installed, we now require:
 
 +-----------------+-----------------+----------+
@@ -268,6 +268,8 @@ If installed, we now require:
 +-----------------+-----------------+----------+
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
+| pyarrow         | 0.7.0           |          |
++-----------------+-----------------+----------+
 
 Additionally, we no longer depend on `feather-format` for feather-based storage
 and have replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`).
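
The enforcement behind this table is a plain LooseVersion comparison at import time; a sketch of the pattern, mirroring the check in pandas/io/parquet.py below:

    from distutils.version import LooseVersion

    import pyarrow

    # fail fast if the installed pyarrow predates the new floor
    if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
        raise ImportError("pyarrow >= 0.7.0 is required for parquet support")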
72 changes: 10 additions & 62 deletions pandas/io/parquet.py
@@ -89,57 +89,38 @@ def __init__(self):
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
             raise ImportError(
-                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
                 "you can install via conda\n"
                 "conda install pyarrow -c conda-forge\n"
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )
 
-        self._pyarrow_lt_060 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
-        self._pyarrow_lt_070 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
-
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', index=None, **kwargs):
         self.validate_dataframe(df)
 
-        # Only validate the index if we're writing it.
-        if self._pyarrow_lt_070 and index is not False:
-            self._validate_write_lt_070(df)
         path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
 
         if index is None:
             from_pandas_kwargs = {}
         else:
             from_pandas_kwargs = {'preserve_index': index}
 
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
-                                               **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_070:
-            result = self.api.parquet.read_pandas(path, columns=columns,
-                                                  **kwargs).to_pandas()
-        else:
-            kwargs['use_pandas_metadata'] = True
-            result = self.api.parquet.read_table(path, columns=columns,
-                                                 **kwargs).to_pandas()
-
+        kwargs['use_pandas_metadata'] = True
+        result = self.api.parquet.read_table(path, columns=columns,
+                                             **kwargs).to_pandas()
         if should_close:
             try:
                 path.close()
@@ -148,39 +129,6 @@ def read(self, path, columns=None, **kwargs):
 
         return result
 
-    def _validate_write_lt_070(self, df):
-        # Compatibility shim for pyarrow < 0.7.0
-        # TODO: Remove in pandas 0.23.0
-        from pandas.core.indexes.multi import MultiIndex
-        if isinstance(df.index, MultiIndex):
-            msg = (
-                "Multi-index DataFrames are only supported "
-                "with pyarrow >= 0.7.0"
-            )
-            raise ValueError(msg)
-        # Validate index
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "pyarrow < 0.7.0 does not support serializing {} for the "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "pyarrow < 0.7.0 does not support serializing a non-default "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "pyarrow < 0.7.0 does not serialize indexes with a name; you "
-                "can set the index.name to None or install the latest version "
-                "of pyarrow or fastparquet."
-            )
-
 
 class FastParquetImpl(BaseImpl):
 
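
With the pre-0.6.0 and pre-0.7.0 branches gone, write and read each reduce to a single pyarrow call. The following standalone sketch shows the equivalent logic using pyarrow directly rather than pandas' internal PyArrowImpl (file name and variable names are illustrative):

    import pandas as pd
    import pyarrow
    import pyarrow.parquet

    df = pd.DataFrame({'a': [1, 2, 3]})

    # write: one code path, no version-dependent shims
    table = pyarrow.Table.from_pandas(df)
    pyarrow.parquet.write_table(table, 'example.parquet',
                                compression='snappy', coerce_timestamps='ms')

    # read: pandas metadata can always be requested on pyarrow >= 0.7.0
    result = pyarrow.parquet.read_table('example.parquet',
                                        use_pandas_metadata=True).to_pandas()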
34 changes: 5 additions & 29 deletions pandas/tests/io/test_parquet.py
@@ -41,22 +41,6 @@ def engine(request):
 
 @pytest.fixture
 def pa():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
     return 'pyarrow'
-
-
-@pytest.fixture
-def pa_lt_070():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
-        pytest.skip("pyarrow is >= 0.7.0")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_ge_070():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
@@ -337,9 +321,9 @@ def test_write_index(self, engine):
         df.index.name = 'foo'
         check_round_trip(df, engine)
 
-    def test_write_multiindex(self, pa_ge_070):
+    def test_write_multiindex(self, pa):
         # Not supported in fastparquet as of 0.1.3 or older pyarrow version
-        engine = pa_ge_070
+        engine = pa
 
         df = pd.DataFrame({'A': [1, 2, 3]})
         index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
@@ -352,8 +336,8 @@ def test_write_column_multiindex(self, engine):
         df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
         self.check_error_on_write(df, engine, ValueError)
 
-    def test_multiindex_with_columns(self, pa_ge_070):
-        engine = pa_ge_070
+    def test_multiindex_with_columns(self, pa):
+        engine = pa
         dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
         df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                           columns=list('ABC'))
@@ -456,8 +440,7 @@ def test_unsupported(self, pa):
         # older pyarrows raise ArrowInvalid
         self.check_error_on_write(df, pa, Exception)
 
-    def test_categorical(self, pa_ge_070):
-        pa = pa_ge_070
+    def test_categorical(self, pa):
 
         # supported in >= 0.7.0
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
@@ -466,13 +449,6 @@ def test_categorical(self, pa_ge_070):
         expected = df.assign(a=df.a.astype(object))
         check_round_trip(df, pa, expected=expected)
 
-    def test_categorical_unsupported(self, pa_lt_070):
-        pa = pa_lt_070
-
-        # supported in >= 0.7.0
-        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
-        self.check_error_on_write(df, pa, NotImplementedError)
-
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa,
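
After this consolidation, the only pyarrow fixture the test module needs is the unconditional one kept at the top of the diff. For reference, the surviving pattern is simply (a sketch assuming the module-level _HAVE_PYARROW flag from the test file):

    import pytest

    try:
        import pyarrow  # noqa
        _HAVE_PYARROW = True
    except ImportError:
        _HAVE_PYARROW = False


    @pytest.fixture
    def pa():
        if not _HAVE_PYARROW:
            pytest.skip("pyarrow is not installed")
        return 'pyarrow'

Tests that previously requested pa_lt_070 or pa_ge_070 now simply take pa, since every supported pyarrow version satisfies the 0.7.0 floor.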