From f7d0b9f1ee95362bed1e9cec947cfcccd2511d28 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann
Date: Wed, 8 Nov 2017 21:11:33 +0100
Subject: [PATCH] restrict columns to read for pandas.read_parquet (#18155)

(cherry picked from commit 5128fe60b24e72c896ebfdb3319e28e710b44386)
---
 doc/source/io.rst               | 10 ++++++++++
 doc/source/whatsnew/v0.21.1.txt |  1 +
 pandas/io/parquet.py            | 16 ++++++++++------
 pandas/tests/io/test_parquet.py | 13 +++++++++++--
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 82cb83c168b22..de3ae2e8f4305 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4580,6 +4580,16 @@ Read from a parquet file.
 
     result.dtypes
 
+Read only certain columns of a parquet file.
+
+.. ipython:: python
+
+    result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
+    result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b'])
+
+    result.dtypes
+
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 966a5a72f3bd4..0c5642ad52853 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -82,6 +82,7 @@ I/O
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
+- :func:`read_parquet` now allows specifying the columns to read from a parquet file (:issue:`18154`)
 
 
 Plotting
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 4b507b7f5df6f..ef95e32cc241e 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
             table, path, compression=compression,
             coerce_timestamps=coerce_timestamps, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path).to_pandas()
+        return self.api.parquet.read_table(path, columns=columns).to_pandas()
 
 
 class FastParquetImpl(object):
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
         self.api.write(path, df,
                        compression=compression, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas()
+        return self.api.ParquetFile(path).to_pandas(columns=columns)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)
 
 
-def read_parquet(path, engine='auto', **kwargs):
+def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.
 
@@ -188,6 +188,10 @@ def read_parquet(path, engine='auto', **kwargs):
     ----------
     path : string
         File path
+    columns : list, default=None
+        If not None, only these columns will be read from the file.
+
+        .. versionadded:: 0.21.1
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +205,4 @@ def read_parquet(path, engine='auto', **kwargs):
     """
 
     impl = get_engine(engine)
-    return impl.read(path)
+    return impl.read(path, columns=columns)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ecd4e8f719014..9a4edf38e2ef4 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -192,7 +192,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
         with tm.ensure_clean() as path:
             df.to_parquet(path, engine, **kwargs)
-            result = read_parquet(path, engine)
+            result = read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -200,7 +200,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
             # repeat
             to_parquet(df, path, engine, **kwargs)
-            result = pd.read_parquet(path, engine)
+            result = pd.read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -282,6 +282,15 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)
 
+    def test_read_columns(self, engine):
+        # GH18154
+        df = pd.DataFrame({'string': list('abc'),
+                           'int': list(range(1, 4))})
+
+        expected = pd.DataFrame({'string': list('abc')})
+        self.check_round_trip(df, engine, expected=expected,
+                              compression=None, columns=["string"])
+
 
 class TestParquetPyArrow(Base):
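
A minimal usage sketch of the keyword this patch adds, assuming pandas 0.21.1 or
later with pyarrow or fastparquet installed; the file name 'example.parquet' is
arbitrary. Column selection is handed to the engine, so only the requested
columns are read from the file:

    import pandas as pd

    df = pd.DataFrame({'string': list('abc'), 'int': [1, 2, 3]})
    df.to_parquet('example.parquet')  # engine='auto' picks pyarrow or fastparquet

    # Read back only the 'string' column; per the patch, pyarrow forwards
    # columns= to parquet.read_table() and fastparquet to
    # ParquetFile(path).to_pandas(columns=...).
    result = pd.read_parquet('example.parquet', columns=['string'])
    print(result.dtypes)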