Bumping up min version for pyarrow and fastparquet (pandas-dev#23482)
* Bumping up min version for pyarrow
anjsudh authored and Pingviinituutti committed Feb 28, 2019
1 parent cb8070c commit 0cd2473
Showing 7 changed files with 32 additions and 104 deletions.
ci/requirements-optional-conda.txt (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
-fastparquet
+fastparquet>=0.1.2
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -12,7 +12,7 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
 pytables>=3.4.2
 pytest-cov
ci/requirements-optional-pip.txt (8 changes: 4 additions & 4 deletions)
@@ -3,7 +3,7 @@
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
-fastparquet
+fastparquet>=0.1.2
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -14,9 +14,9 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
-tables
+pytables>=3.4.2
 pytest-cov
 pytest-xdist
 s3fs
@@ -27,4 +27,4 @@ statsmodels
 xarray
 xlrd
 xlsxwriter
-xlwt
\ No newline at end of file
+xlwt
ci/travis-27.yaml (2 changes: 1 addition & 1 deletion)
@@ -22,7 +22,7 @@ dependencies:
   - patsy
   - psycopg2
   - py
-  - pyarrow=0.4.1
+  - pyarrow=0.7.0
   - PyCrypto
   - pymysql=0.6.3
   - pytables
doc/source/install.rst (4 changes: 2 additions & 2 deletions)
@@ -258,8 +258,8 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
 * `xarray <http://xarray.pydata.org>`__: pandas-like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
-* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ libraries are available for compression support.
+* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0): necessary for feather-based storage.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.1.2) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:

 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
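Note: a minimal sketch of how the two documented engines are selected at the pandas API level; the file names are placeholders, and compression with snappy or brotli assumes the corresponding library is installed.

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# Write with pyarrow (>= 0.7.0 after this change); snappy is the default codec.
df.to_parquet('example.parquet', engine='pyarrow', compression='snappy')

# Or write with fastparquet (>= 0.1.2 after this change).
df.to_parquet('example_fp.parquet', engine='fastparquet', compression='snappy')

# Read back, optionally selecting columns; engine='auto' tries pyarrow first,
# then falls back to fastparquet.
restored = pd.read_parquet('example.parquet', engine='auto', columns=['a'])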
doc/source/whatsnew/v0.24.0.txt (6 changes: 5 additions & 1 deletion)
@@ -250,7 +250,7 @@ Backwards incompatible API changes
 Dependencies have increased minimum versions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-We have updated our minimum supported versions of dependencies (:issue:`21242`).
+We have updated our minimum supported versions of dependencies (:issue:`21242`, :issue:`18742`).
 If installed, we now require:

 +-----------------+-----------------+----------+
@@ -268,6 +268,10 @@ If installed, we now require:
 +-----------------+-----------------+----------+
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
+| pyarrow         | 0.7.0           |          |
++-----------------+-----------------+----------+
+| fastparquet     | 0.1.2           |          |
++-----------------+-----------------+----------+

 Additionally we no longer depend on `feather-format` for feather-based storage
 and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`).
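Note: since feather support now goes through pyarrow rather than the feather-format package, a small usage sketch (the file path is a placeholder):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# Backed by pyarrow (>= 0.7.0 after this change) instead of feather-format.
df.to_feather('example.feather')
restored = pd.read_feather('example.feather')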
pandas/io/parquet.py (78 changes: 13 additions & 65 deletions)
@@ -5,7 +5,7 @@

 from pandas.compat import string_types

-from pandas import DataFrame, Int64Index, RangeIndex, get_option
+from pandas import DataFrame, get_option
 import pandas.core.common as com

 from pandas.io.common import get_filepath_or_buffer, is_s3_url
@@ -89,57 +89,38 @@ def __init__(self):
             "\nor via pip\n"
             "pip install -U pyarrow\n"
         )
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
             raise ImportError(
-                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
                 "you can install via conda\n"
                 "conda install pyarrow -c conda-forge\n"
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )

-        self._pyarrow_lt_060 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
-        self._pyarrow_lt_070 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
-
         self.api = pyarrow

     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', index=None, **kwargs):
         self.validate_dataframe(df)

-        # Only validate the index if we're writing it.
-        if self._pyarrow_lt_070 and index is not False:
-            self._validate_write_lt_070(df)
         path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

         if index is None:
             from_pandas_kwargs = {}
         else:
             from_pandas_kwargs = {'preserve_index': index}

-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
-                                               **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)

     def read(self, path, columns=None, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_070:
-            result = self.api.parquet.read_pandas(path, columns=columns,
-                                                  **kwargs).to_pandas()
-        else:
-            kwargs['use_pandas_metadata'] = True
-            result = self.api.parquet.read_table(path, columns=columns,
-                                                 **kwargs).to_pandas()
-
+        kwargs['use_pandas_metadata'] = True
+        result = self.api.parquet.read_table(path, columns=columns,
+                                             **kwargs).to_pandas()
         if should_close:
             try:
                 path.close()
@@ -148,39 +129,6 @@ def read(self, path, columns=None, **kwargs):

         return result

-    def _validate_write_lt_070(self, df):
-        # Compatibility shim for pyarrow < 0.7.0
-        # TODO: Remove in pandas 0.23.0
-        from pandas.core.indexes.multi import MultiIndex
-        if isinstance(df.index, MultiIndex):
-            msg = (
-                "Multi-index DataFrames are only supported "
-                "with pyarrow >= 0.7.0"
-            )
-            raise ValueError(msg)
-        # Validate index
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "pyarrow < 0.7.0 does not support serializing {} for the "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "pyarrow < 0.7.0 does not support serializing a non-default "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "pyarrow < 0.7.0 does not serialize indexes with a name; you "
-                "can set the index.name to None or install the latest version "
-                "of pyarrow or fastparquet."
-            )
-

 class FastParquetImpl(BaseImpl):

@@ -197,9 +145,9 @@ def __init__(self):
             "\nor via pip\n"
             "pip install -U fastparquet"
         )
-        if LooseVersion(fastparquet.__version__) < '0.1.0':
+        if LooseVersion(fastparquet.__version__) < '0.1.2':
             raise ImportError(
-                "fastparquet >= 0.1.0 is required for parquet "
+                "fastparquet >= 0.1.2 is required for parquet "
                 "support\n\n"
                 "you can install via conda\n"
                 "conda install fastparquet -c conda-forge\n"
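Note: a self-contained sketch of the single remaining pyarrow (>= 0.7.0) code path that the simplified write/read above relies on; 'out.parquet' is a placeholder path.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'a': [1, 2, 3]})

# Write: convert through an Arrow table, keeping the index, then write parquet.
table = pa.Table.from_pandas(df, preserve_index=True)
pq.write_table(table, 'out.parquet', compression='snappy',
               coerce_timestamps='ms')

# Read: use_pandas_metadata=True restores the index and dtypes recorded
# at write time, which is why the pre-0.7.0 read_pandas branch is no
# longer needed.
restored = pq.read_table('out.parquet', use_pandas_metadata=True).to_pandas()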
pandas/tests/io/test_parquet.py (34 changes: 5 additions & 29 deletions)
@@ -41,22 +41,6 @@ def engine(request):

 @pytest.fixture
 def pa():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_lt_070():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
-        pytest.skip("pyarrow is >= 0.7.0")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_ge_070():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
     if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
@@ -337,9 +321,9 @@ def test_write_index(self, engine):
         df.index.name = 'foo'
         check_round_trip(df, engine)

-    def test_write_multiindex(self, pa_ge_070):
+    def test_write_multiindex(self, pa):
         # Not supported in fastparquet as of 0.1.3 or older pyarrow version
-        engine = pa_ge_070
+        engine = pa

         df = pd.DataFrame({'A': [1, 2, 3]})
         index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
@@ -352,8 +336,8 @@ def test_write_column_multiindex(self, engine):
         df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
         self.check_error_on_write(df, engine, ValueError)

-    def test_multiindex_with_columns(self, pa_ge_070):
-        engine = pa_ge_070
+    def test_multiindex_with_columns(self, pa):
+        engine = pa
         dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
         df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                           columns=list('ABC'))
@@ -456,8 +440,7 @@ def test_unsupported(self, pa):
         # older pyarrows raise ArrowInvalid
         self.check_error_on_write(df, pa, Exception)

-    def test_categorical(self, pa_ge_070):
-        pa = pa_ge_070
+    def test_categorical(self, pa):

         # supported in >= 0.7.0
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
@@ -466,13 +449,6 @@ def test_categorical(self, pa_ge_070):
         expected = df.assign(a=df.a.astype(object))
         check_round_trip(df, pa, expected=expected)

-    def test_categorical_unsupported(self, pa_lt_070):
-        pa = pa_lt_070
-
-        # supported in >= 0.7.0
-        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
-        self.check_error_on_write(df, pa, NotImplementedError)
-
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa,
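Note: the consolidated fixture pattern from the hunks above, reconstructed as a self-contained sketch; skipping keeps the suite green when the optional dependency is absent or too old.

from distutils.version import LooseVersion

import pytest

try:
    import pyarrow
    _HAVE_PYARROW = True
except ImportError:
    _HAVE_PYARROW = False


@pytest.fixture
def pa():
    # One fixture now covers both checks, replacing pa_lt_070/pa_ge_070.
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")
    if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
        pytest.skip("pyarrow is < 0.7.0")
    return 'pyarrow'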
