From f7d0b9f1ee95362bed1e9cec947cfcccd2511d28 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann
Date: Wed, 8 Nov 2017 21:11:33 +0100
Subject: [PATCH] restrict columns to read for pandas.read_parquet (#18155)

(cherry picked from commit 5128fe60b24e72c896ebfdb3319e28e710b44386)
---
 doc/source/io.rst               | 10 ++++++++++
 doc/source/whatsnew/v0.21.1.txt |  1 +
 pandas/io/parquet.py            | 16 ++++++++++------
 pandas/tests/io/test_parquet.py | 13 +++++++++++--
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 82cb83c168b22..de3ae2e8f4305 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4580,6 +4580,16 @@ Read from a parquet file.
 
     result.dtypes
 
+Read only certain columns of a parquet file.
+
+.. ipython:: python
+
+    result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
+    result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b'])
+
+    result.dtypes
+
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 966a5a72f3bd4..0c5642ad52853 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -82,6 +82,7 @@ I/O
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
+- :func:`read_parquet` now allows specifying the columns to read from a parquet file (:issue:`18154`)
 
 
 Plotting
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 4b507b7f5df6f..ef95e32cc241e 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
             table, path, compression=compression,
             coerce_timestamps=coerce_timestamps, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path).to_pandas()
+        return self.api.parquet.read_table(path, columns=columns).to_pandas()
 
 
 class FastParquetImpl(object):
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
         self.api.write(path, df,
                        compression=compression, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas()
+        return self.api.ParquetFile(path).to_pandas(columns=columns)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)
 
 
-def read_parquet(path, engine='auto', **kwargs):
+def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.
 
@@ -188,6 +188,10 @@ def read_parquet(path, engine='auto', **kwargs):
     ----------
     path : string
         File path
+    columns : list, default=None
+        If not None, only these columns will be read from the file.
+
+        .. versionadded:: 0.21.1
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +205,4 @@ def read_parquet(path, engine='auto', **kwargs):
     """
 
     impl = get_engine(engine)
-    return impl.read(path)
+    return impl.read(path, columns=columns)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ecd4e8f719014..9a4edf38e2ef4 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -192,7 +192,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
         with tm.ensure_clean() as path:
             df.to_parquet(path, engine, **kwargs)
-            result = read_parquet(path, engine)
+            result = read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -200,7 +200,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
             # repeat
             to_parquet(df, path, engine, **kwargs)
-            result = pd.read_parquet(path, engine)
+            result = pd.read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -282,6 +282,15 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)
 
+    def test_read_columns(self, engine):
+        # GH18154
+        df = pd.DataFrame({'string': list('abc'),
+                           'int': list(range(1, 4))})
+
+        expected = pd.DataFrame({'string': list('abc')})
+        self.check_round_trip(df, engine, expected=expected,
+                              compression=None, columns=["string"])
+
 
 class TestParquetPyArrow(Base):
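
A minimal usage sketch of the keyword this patch adds, assuming pandas 0.21.1 or
later with pyarrow or fastparquet installed; the file name 'example.parquet' is
arbitrary. Column selection is handed to the engine, so only the requested
columns are read from the file:

    import pandas as pd

    df = pd.DataFrame({'string': list('abc'), 'int': [1, 2, 3]})
    df.to_parquet('example.parquet')  # engine='auto' picks pyarrow or fastparquet

    # Read back only the 'string' column; per the patch, pyarrow forwards
    # columns= to parquet.read_table() and fastparquet to
    # ParquetFile(path).to_pandas(columns=...).
    result = pd.read_parquet('example.parquet', columns=['string'])
    print(result.dtypes)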