restrict columns to read for pandas.read_parquet (#18155)
(cherry picked from commit 5128fe6)
hoffmann authored and TomAugspurger committed Dec 11, 2017
1 parent 560190e commit 50ff9e3
Showing 4 changed files with 32 additions and 8 deletions.
10 changes: 10 additions & 0 deletions doc/source/io.rst
@@ -4580,6 +4580,16 @@ Read from a parquet file.
    result.dtypes
 
+Read only certain columns of a parquet file.
+
+.. ipython:: python
+
+   result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
+   result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b'])
+   result.dtypes
+
+
 .. ipython:: python
    :suppress:
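A rough end-to-end sketch of the documented behaviour (not part of the commit; 'example.parquet' is a hypothetical file name, and pyarrow is assumed to be installed):

    import pandas as pd

    # Write a three-column frame, then read back only two of its columns.
    df = pd.DataFrame({'a': [1, 2, 3], 'b': list('xyz'), 'c': [0.1, 0.2, 0.3]})
    df.to_parquet('example.parquet', engine='pyarrow')

    result = pd.read_parquet('example.parquet', engine='pyarrow', columns=['a', 'b'])
    result.dtypes  # only 'a' and 'b' are present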
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.1.txt
@@ -82,6 +82,7 @@ I/O
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
+- :func:`read_parquet` now allows specifying which columns to read from a parquet file (:issue:`18154`)


Plotting
16 changes: 10 additions & 6 deletions pandas/io/parquet.py
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
             table, path, compression=compression,
             coerce_timestamps=coerce_timestamps, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path).to_pandas()
+        return self.api.parquet.read_table(path, columns=columns).to_pandas()


class FastParquetImpl(object):
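For reference, a minimal sketch of what the `PyArrowImpl.read` change above delegates to, assuming pyarrow is installed and 'example.parquet' is a hypothetical path:

    import pyarrow.parquet as pq

    # read_table accepts a columns= list and materializes only those
    # columns; to_pandas() then converts the Arrow table to a DataFrame.
    table = pq.read_table('example.parquet', columns=['a', 'b'])
    df = table.to_pandas()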
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
         self.api.write(path, df,
                        compression=compression, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas()
+        return self.api.ParquetFile(path).to_pandas(columns=columns)


def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
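The `FastParquetImpl.read` change relies on `ParquetFile.to_pandas` accepting a columns argument; a minimal sketch under the same assumptions ('example.parquet' is hypothetical):

    from fastparquet import ParquetFile

    # Only the requested columns are decoded from the file.
    pf = ParquetFile('example.parquet')
    df = pf.to_pandas(columns=['a', 'b'])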
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)
 
 
-def read_parquet(path, engine='auto', **kwargs):
+def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.
@@ -188,6 +188,10 @@ def read_parquet(path, engine='auto', **kwargs):
     ----------
     path : string
         File path
+    columns : list, default=None
+        If not None, only these columns will be read from the file.
+
+        .. versionadded:: 0.21.1
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +205,4 @@
     """
 
     impl = get_engine(engine)
-    return impl.read(path)
+    return impl.read(path, columns=columns)
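Taken together, a short usage sketch of the new keyword (file name hypothetical; either installed engine works):

    import pandas as pd

    # columns=None (the default) keeps the old read-everything behaviour;
    # passing a list restricts the read to those columns.
    subset = pd.read_parquet('example.parquet', columns=['string'])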
13 changes: 11 additions & 2 deletions pandas/tests/io/test_parquet.py
@@ -192,15 +192,15 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
         with tm.ensure_clean() as path:
             df.to_parquet(path, engine, **kwargs)
-            result = read_parquet(path, engine)
+            result = read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
             tm.assert_frame_equal(result, expected)
 
             # repeat
             to_parquet(df, path, engine, **kwargs)
-            result = pd.read_parquet(path, engine)
+            result = pd.read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -282,6 +282,15 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)
 
+    def test_read_columns(self, engine):
+        # GH18154
+        df = pd.DataFrame({'string': list('abc'),
+                           'int': list(range(1, 4))})
+
+        expected = pd.DataFrame({'string': list('abc')})
+        self.check_round_trip(df, engine, expected=expected,
+                              compression=None, columns=["string"])
+
 
 class TestParquetPyArrow(Base):

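A standalone version of the round trip the new test exercises (a sketch, assuming pyarrow is installed; pandas.util.testing was the public test-helper module at the time):

    import pandas as pd
    import pandas.util.testing as tm

    df = pd.DataFrame({'string': list('abc'), 'int': list(range(1, 4))})
    expected = pd.DataFrame({'string': list('abc')})

    with tm.ensure_clean() as path:  # temporary file, removed on exit
        df.to_parquet(path, 'pyarrow', compression=None)
        result = pd.read_parquet(path, 'pyarrow', columns=['string'])

    tm.assert_frame_equal(result, expected)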
