ENH: Adding engine_kwargs to Excel engines for issue #40274 (#52214)

* Fixing merge conflicts * Fixing merge conflict * Fixing documentation issues * standardized usage of engine_kwargs, fixed unit tests & doc strings * Fixing documentation issues * Fixing implementation logic and unit tests * Fixing implementation logic * Fixing formatting issues * Fixing error for test Docstring validation, typing, and other manual pre-commit hooks * Fixing documentation error * Standardizing engine_kwarg types * Fixing minor issues with unit tests and documentation * Fixing documentation issue * Fixing a formatting / documentation error * Fixing documentation errors * Fixing documentation errors * Fixing documentation errors * Fixing documentation errors * Fixing documentation errors * Adding an extra blank line to troubleshoot documentation error * Adding an extra blank line to troubleshoot documentation error * Fixing documentation issues * Fixing formatting errors * Fixing formatting errors * Fixing formatting errors * Fixing logic and formatting issues in unit tests * Fixing issues with merge conflict * Fixing formatting issue * Update pandas/io/excel/_base.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
pandas-dev · Apr 12, 2023 · 7eeec0d · 7eeec0d
1 parent cfbbeb6
commit 7eeec0d
Show file tree

Hide file tree

Showing 8 changed files with 132 additions and 20 deletions.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -3449,6 +3449,18 @@ Reading Excel files
 In the most basic use-case, ``read_excel`` takes a path to an Excel
 file, and the ``sheet_name`` indicating which sheet to parse.
 
+When using the ``engine_kwargs`` parameter, pandas will pass these arguments to the
+engine. For this, it is important to know which function pandas is
+using internally.
+
+* For the engine openpyxl, pandas is using :func:`openpyxl.load_workbook` to read in (``.xlsx``) and (``.xlsm``) files.
+
+* For the engine xlrd, pandas is using :func:`xlrd.open_workbook` to read in (``.xls``) files.
+
+* For the engine pyxlsb, pandas is using :func:`pyxlsb.open_workbook` to read in (``.xlsb``) files.
+
+* For the engine odf, pandas is using :func:`odf.opendocument.load` to read in (``.ods``) files.
+
 .. code-block:: python
 
    # Returns a DataFrame

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -87,6 +87,7 @@ Other enhancements
 - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`)
 - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
 - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
+- Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -289,6 +289,9 @@
 
     .. versionadded:: 2.0
 
+engine_kwargs : dict, optional
+    Arbitrary keyword arguments passed to excel engine.
+
 Returns
 -------
 DataFrame or dict of DataFrames
@@ -302,6 +305,11 @@
 read_csv : Read a comma-separated values (csv) file into DataFrame.
 read_fwf : Read a table of fixed-width formatted lines into DataFrame.
 
+Notes
+-----
+For specific information on the methods used for each Excel engine, refer to the pandas
+:ref:`user guide <io.excel_reader>`
+
 Examples
 --------
 The file can be read using the file name as string or an open file object:
@@ -472,13 +480,21 @@ def read_excel(
     skipfooter: int = 0,
     storage_options: StorageOptions = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    engine_kwargs: dict | None = None,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
     check_dtype_backend(dtype_backend)
-
     should_close = False
+    if engine_kwargs is None:
+        engine_kwargs = {}
+
     if not isinstance(io, ExcelFile):
         should_close = True
-        io = ExcelFile(io, storage_options=storage_options, engine=engine)
+        io = ExcelFile(
+            io,
+            storage_options=storage_options,
+            engine=engine,
+            engine_kwargs=engine_kwargs,
+        )
     elif engine and engine != io.engine:
         raise ValueError(
             "Engine should not be specified when passing "
@@ -520,8 +536,14 @@ def read_excel(
 
 class BaseExcelReader(metaclass=abc.ABCMeta):
     def __init__(
-        self, filepath_or_buffer, storage_options: StorageOptions = None
+        self,
+        filepath_or_buffer,
+        storage_options: StorageOptions = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
+        if engine_kwargs is None:
+            engine_kwargs = {}
+
         # First argument can also be bytes, so create a buffer
         if isinstance(filepath_or_buffer, bytes):
             filepath_or_buffer = BytesIO(filepath_or_buffer)
@@ -540,7 +562,7 @@ def __init__(
             # N.B. xlrd.Book has a read attribute too
             self.handles.handle.seek(0)
             try:
-                self.book = self.load_workbook(self.handles.handle)
+                self.book = self.load_workbook(self.handles.handle, engine_kwargs)
             except Exception:
                 self.close()
                 raise
@@ -555,7 +577,7 @@ def _workbook_class(self):
         pass
 
     @abc.abstractmethod
-    def load_workbook(self, filepath_or_buffer):
+    def load_workbook(self, filepath_or_buffer, engine_kwargs):
         pass
 
     def close(self) -> None:
@@ -1450,6 +1472,8 @@ class ExcelFile:
 
             Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
             This is not supported, switch to using ``openpyxl`` instead.
+    engine_kwargs : dict, optional
+        Arbitrary keyword arguments passed to excel engine.
     """
 
     from pandas.io.excel._odfreader import ODFReader
@@ -1469,7 +1493,11 @@ def __init__(
         path_or_buffer,
         engine: str | None = None,
         storage_options: StorageOptions = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
+        if engine_kwargs is None:
+            engine_kwargs = {}
+
         if engine is not None and engine not in self._engines:
             raise ValueError(f"Unknown engine: {engine}")
 
@@ -1513,7 +1541,11 @@ def __init__(
         self.engine = engine
         self.storage_options = storage_options
 
-        self._reader = self._engines[engine](self._io, storage_options=storage_options)
+        self._reader = self._engines[engine](
+            self._io,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
 
     def __fspath__(self):
         return self._io

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -31,6 +31,7 @@ def __init__(
         self,
         filepath_or_buffer: FilePath | ReadBuffer[bytes],
         storage_options: StorageOptions = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
         """
         Read tables out of OpenDocument formatted files.
@@ -40,20 +41,28 @@ def __init__(
         filepath_or_buffer : str, path to be parsed or
             an open readable stream.
         {storage_options}
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
         """
         import_optional_dependency("odf")
-        super().__init__(filepath_or_buffer, storage_options=storage_options)
+        super().__init__(
+            filepath_or_buffer,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
 
     @property
     def _workbook_class(self):
         from odf.opendocument import OpenDocument
 
         return OpenDocument
 
-    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
+    def load_workbook(
+        self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
+    ):
         from odf.opendocument import load
 
-        return load(filepath_or_buffer)
+        return load(filepath_or_buffer, **engine_kwargs)
 
     @property
     def empty_value(self) -> str:

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -536,6 +536,7 @@ def __init__(
         self,
         filepath_or_buffer: FilePath | ReadBuffer[bytes],
         storage_options: StorageOptions = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
         """
         Reader using openpyxl engine.
@@ -545,21 +546,33 @@ def __init__(
         filepath_or_buffer : str, path object or Workbook
             Object to be parsed.
         {storage_options}
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
         """
         import_optional_dependency("openpyxl")
-        super().__init__(filepath_or_buffer, storage_options=storage_options)
+        super().__init__(
+            filepath_or_buffer,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
 
     @property
     def _workbook_class(self):
         from openpyxl import Workbook
 
         return Workbook
 
-    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
+    def load_workbook(
+        self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
+    ):
         from openpyxl import load_workbook
 
         return load_workbook(
-            filepath_or_buffer, read_only=True, data_only=True, keep_links=False
+            filepath_or_buffer,
+            read_only=True,
+            data_only=True,
+            keep_links=False,
+            **engine_kwargs,
         )
 
     @property

diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -25,6 +25,7 @@ def __init__(
         self,
         filepath_or_buffer: FilePath | ReadBuffer[bytes],
         storage_options: StorageOptions = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
         """
         Reader using pyxlsb engine.
@@ -34,26 +35,34 @@ def __init__(
         filepath_or_buffer : str, path object, or Workbook
             Object to be parsed.
         {storage_options}
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
         """
         import_optional_dependency("pyxlsb")
         # This will call load_workbook on the filepath or buffer
         # And set the result to the book-attribute
-        super().__init__(filepath_or_buffer, storage_options=storage_options)
+        super().__init__(
+            filepath_or_buffer,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
 
     @property
     def _workbook_class(self):
         from pyxlsb import Workbook
 
         return Workbook
 
-    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
+    def load_workbook(
+        self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
+    ):
         from pyxlsb import open_workbook
 
         # TODO: hack in buffer capability
         # This might need some modifications to the Pyxlsb library
         # Actual work for opening it is in xlsbpackage.py, line 20-ish
 
-        return open_workbook(filepath_or_buffer)
+        return open_workbook(filepath_or_buffer, **engine_kwargs)
 
     @property
     def sheet_names(self) -> list[str]:

diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -22,7 +22,10 @@
 class XlrdReader(BaseExcelReader):
     @doc(storage_options=_shared_docs["storage_options"])
     def __init__(
-        self, filepath_or_buffer, storage_options: StorageOptions = None
+        self,
+        filepath_or_buffer,
+        storage_options: StorageOptions = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
         """
         Reader using xlrd engine.
@@ -32,25 +35,31 @@ def __init__(
         filepath_or_buffer : str, path object or Workbook
             Object to be parsed.
         {storage_options}
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
         """
         err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
         import_optional_dependency("xlrd", extra=err_msg)
-        super().__init__(filepath_or_buffer, storage_options=storage_options)
+        super().__init__(
+            filepath_or_buffer,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
 
     @property
     def _workbook_class(self):
         from xlrd import Book
 
         return Book
 
-    def load_workbook(self, filepath_or_buffer):
+    def load_workbook(self, filepath_or_buffer, engine_kwargs):
         from xlrd import open_workbook
 
         if hasattr(filepath_or_buffer, "read"):
             data = filepath_or_buffer.read()
-            return open_workbook(file_contents=data)
+            return open_workbook(file_contents=data, **engine_kwargs)
         else:
-            return open_workbook(filepath_or_buffer)
+            return open_workbook(filepath_or_buffer, **engine_kwargs)
 
     @property
     def sheet_names(self):

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -6,6 +6,7 @@
 import os
 from pathlib import Path
 import platform
+import re
 from urllib.error import URLError
 from zipfile import BadZipFile
 
@@ -148,6 +149,32 @@ def parser(self, *args, **kwargs):
             expected = expected_defaults[read_ext[1:]]
         assert result == expected
 
+    def test_engine_kwargs(self, read_ext, engine):
+        # GH#52214
+        expected_defaults = {
+            "xlsx": {"foo": "abcd"},
+            "xlsm": {"foo": 123},
+            "xlsb": {"foo": "True"},
+            "xls": {"foo": True},
+            "ods": {"foo": "abcd"},
+        }
+
+        if read_ext[1:] == "xls" or read_ext[1:] == "xlsb":
+            msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'")
+        elif read_ext[1:] == "ods":
+            msg = re.escape(r"load() got an unexpected keyword argument 'foo'")
+        else:
+            msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'")
+
+        if engine is not None:
+            with pytest.raises(TypeError, match=msg):
+                pd.read_excel(
+                    "test1" + read_ext,
+                    sheet_name="Sheet1",
+                    index_col=0,
+                    engine_kwargs=expected_defaults[read_ext[1:]],
+                )
+
     def test_usecols_int(self, read_ext):
         # usecols as int
         msg = "Passing an integer for `usecols`"