Skip to content

Commit

Permalink
ENH: Adding engine_kwargs to Excel engines for issue #40274 (#52214)
Browse files Browse the repository at this point in the history
* Fixing merge conflicts

* Fixing merge conflict

* Fixing documentation issues

* standardized usage of engine_kwargs, fixed unit tests & doc strings

* Fixing documentation issues

* Fixing implementation logic and unit tests

* Fixing implementation logic

* Fixing formatting issues

* Fixing error for test Docstring validation, typing, and other manual pre-commit hooks

* Fixing documentation error

* Standardizing engine_kwarg types

* Fixing minor issues with unit tests and documentation

* Fixing documentation issue

* Fixing a formatting / documentation error

* Fixing documentation errors

* Fixing documentation errors

* Fixing documentation errors

* Fixing documentation errors

* Fixing documentation errors

* Adding an extra blank line to troubleshoot documentation error

* Adding an extra blank line to troubleshoot documentation error

* Fixing documentation issues

* Fixing formatting errors

* Fixing formatting errors

* Fixing formatting errors

* Fixing logic and formatting issues in unit tests

* Fixing issues with merge conflict

* Fixing formatting issue

* Update pandas/io/excel/_base.py

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
  • Loading branch information
rmhowe425 and mroeschke authored Apr 12, 2023
1 parent cfbbeb6 commit 7eeec0d
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 20 deletions.
12 changes: 12 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3449,6 +3449,18 @@ Reading Excel files
In the most basic use-case, ``read_excel`` takes a path to an Excel
file, and the ``sheet_name`` indicating which sheet to parse.

The ``engine_kwargs`` parameter is passed through to the underlying engine as
keyword arguments, so it is important to know which function pandas calls
internally for each engine.

* For the engine openpyxl, pandas is using :func:`openpyxl.load_workbook` to read in ``.xlsx`` and ``.xlsm`` files.

* For the engine xlrd, pandas is using :func:`xlrd.open_workbook` to read in ``.xls`` files.

* For the engine pyxlsb, pandas is using :func:`pyxlsb.open_workbook` to read in ``.xlsb`` files.

* For the engine odf, pandas is using :func:`odf.opendocument.load` to read in ``.ods`` files.

.. code-block:: python
# Returns a DataFrame
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Other enhancements
- :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`)
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
- Added ``engine_kwargs`` parameter to :func:`read_excel` (:issue:`52214`)
-

.. ---------------------------------------------------------------------------
Expand Down
44 changes: 38 additions & 6 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,9 @@
.. versionadded:: 2.0
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to the Excel engine.
Returns
-------
DataFrame or dict of DataFrames
Expand All @@ -302,6 +305,11 @@
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
Notes
-----
For specific information on the methods used for each Excel engine, refer to the pandas
:ref:`user guide <io.excel_reader>`.
Examples
--------
The file can be read using the file name as string or an open file object:
Expand Down Expand Up @@ -472,13 +480,21 @@ def read_excel(
skipfooter: int = 0,
storage_options: StorageOptions = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
engine_kwargs: dict | None = None,
) -> DataFrame | dict[IntStrT, DataFrame]:
check_dtype_backend(dtype_backend)

should_close = False
if engine_kwargs is None:
engine_kwargs = {}

if not isinstance(io, ExcelFile):
should_close = True
io = ExcelFile(io, storage_options=storage_options, engine=engine)
io = ExcelFile(
io,
storage_options=storage_options,
engine=engine,
engine_kwargs=engine_kwargs,
)
elif engine and engine != io.engine:
raise ValueError(
"Engine should not be specified when passing "
Expand Down Expand Up @@ -520,8 +536,14 @@ def read_excel(

class BaseExcelReader(metaclass=abc.ABCMeta):
def __init__(
self, filepath_or_buffer, storage_options: StorageOptions = None
self,
filepath_or_buffer,
storage_options: StorageOptions = None,
engine_kwargs: dict | None = None,
) -> None:
if engine_kwargs is None:
engine_kwargs = {}

# First argument can also be bytes, so create a buffer
if isinstance(filepath_or_buffer, bytes):
filepath_or_buffer = BytesIO(filepath_or_buffer)
Expand All @@ -540,7 +562,7 @@ def __init__(
# N.B. xlrd.Book has a read attribute too
self.handles.handle.seek(0)
try:
self.book = self.load_workbook(self.handles.handle)
self.book = self.load_workbook(self.handles.handle, engine_kwargs)
except Exception:
self.close()
raise
Expand All @@ -555,7 +577,7 @@ def _workbook_class(self):
pass

@abc.abstractmethod
def load_workbook(self, filepath_or_buffer):
def load_workbook(self, filepath_or_buffer, engine_kwargs):
pass

def close(self) -> None:
Expand Down Expand Up @@ -1450,6 +1472,8 @@ class ExcelFile:
Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
This is not supported, switch to using ``openpyxl`` instead.
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to the Excel engine.
"""

from pandas.io.excel._odfreader import ODFReader
Expand All @@ -1469,7 +1493,11 @@ def __init__(
path_or_buffer,
engine: str | None = None,
storage_options: StorageOptions = None,
engine_kwargs: dict | None = None,
) -> None:
if engine_kwargs is None:
engine_kwargs = {}

if engine is not None and engine not in self._engines:
raise ValueError(f"Unknown engine: {engine}")

Expand Down Expand Up @@ -1513,7 +1541,11 @@ def __init__(
self.engine = engine
self.storage_options = storage_options

self._reader = self._engines[engine](self._io, storage_options=storage_options)
self._reader = self._engines[engine](
self._io,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)

def __fspath__(self):
return self._io
Expand Down
15 changes: 12 additions & 3 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Read tables out of OpenDocument formatted files.
Expand All @@ -40,20 +41,28 @@ def __init__(
filepath_or_buffer : str, path to be parsed or
an open readable stream.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to the Excel engine.
"""
import_optional_dependency("odf")
super().__init__(filepath_or_buffer, storage_options=storage_options)
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)

@property
def _workbook_class(self):
from odf.opendocument import OpenDocument

return OpenDocument

def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
):
from odf.opendocument import load

return load(filepath_or_buffer)
return load(filepath_or_buffer, **engine_kwargs)

@property
def empty_value(self) -> str:
Expand Down
19 changes: 16 additions & 3 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using openpyxl engine.
Expand All @@ -545,21 +546,33 @@ def __init__(
filepath_or_buffer : str, path object or Workbook
Object to be parsed.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to the Excel engine.
"""
import_optional_dependency("openpyxl")
super().__init__(filepath_or_buffer, storage_options=storage_options)
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)

@property
def _workbook_class(self):
from openpyxl import Workbook

return Workbook

def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
):
from openpyxl import load_workbook

return load_workbook(
filepath_or_buffer, read_only=True, data_only=True, keep_links=False
filepath_or_buffer,
read_only=True,
data_only=True,
keep_links=False,
**engine_kwargs,
)

@property
Expand Down
15 changes: 12 additions & 3 deletions pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using pyxlsb engine.
Expand All @@ -34,26 +35,34 @@ def __init__(
filepath_or_buffer : str, path object, or Workbook
Object to be parsed.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to the Excel engine.
"""
import_optional_dependency("pyxlsb")
# This will call load_workbook on the filepath or buffer
# And set the result to the book-attribute
super().__init__(filepath_or_buffer, storage_options=storage_options)
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)

@property
def _workbook_class(self):
from pyxlsb import Workbook

return Workbook

def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
):
from pyxlsb import open_workbook

# TODO: hack in buffer capability
# This might need some modifications to the Pyxlsb library
# Actual work for opening it is in xlsbpackage.py, line 20-ish

return open_workbook(filepath_or_buffer)
return open_workbook(filepath_or_buffer, **engine_kwargs)

@property
def sheet_names(self) -> list[str]:
Expand Down
19 changes: 14 additions & 5 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
class XlrdReader(BaseExcelReader):
@doc(storage_options=_shared_docs["storage_options"])
def __init__(
self, filepath_or_buffer, storage_options: StorageOptions = None
self,
filepath_or_buffer,
storage_options: StorageOptions = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using xlrd engine.
Expand All @@ -32,25 +35,31 @@ def __init__(
filepath_or_buffer : str, path object or Workbook
Object to be parsed.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to the Excel engine.
"""
err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
import_optional_dependency("xlrd", extra=err_msg)
super().__init__(filepath_or_buffer, storage_options=storage_options)
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)

@property
def _workbook_class(self):
from xlrd import Book

return Book

def load_workbook(self, filepath_or_buffer):
def load_workbook(self, filepath_or_buffer, engine_kwargs):
from xlrd import open_workbook

if hasattr(filepath_or_buffer, "read"):
data = filepath_or_buffer.read()
return open_workbook(file_contents=data)
return open_workbook(file_contents=data, **engine_kwargs)
else:
return open_workbook(filepath_or_buffer)
return open_workbook(filepath_or_buffer, **engine_kwargs)

@property
def sheet_names(self):
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
from pathlib import Path
import platform
import re
from urllib.error import URLError
from zipfile import BadZipFile

Expand Down Expand Up @@ -148,6 +149,32 @@ def parser(self, *args, **kwargs):
expected = expected_defaults[read_ext[1:]]
assert result == expected

def test_engine_kwargs(self, read_ext, engine):
    # GH#52214
    # An unsupported ``foo`` keyword is supplied through ``engine_kwargs``;
    # the read is expected to fail with a TypeError whose message names the
    # loader function associated with the file extension.
    ext = read_ext[1:]

    # One bogus ``foo`` keyword per file extension.
    bogus_kwargs = {
        "xlsx": {"foo": "abcd"},
        "xlsm": {"foo": 123},
        "xlsb": {"foo": "True"},
        "xls": {"foo": True},
        "ods": {"foo": "abcd"},
    }

    # Expected error text by extension; anything not listed here falls back
    # to the ``load_workbook()`` message.
    rejection_by_ext = {
        "xls": r"open_workbook() got an unexpected keyword argument 'foo'",
        "xlsb": r"open_workbook() got an unexpected keyword argument 'foo'",
        "ods": r"load() got an unexpected keyword argument 'foo'",
    }
    msg = re.escape(
        rejection_by_ext.get(
            ext, r"load_workbook() got an unexpected keyword argument 'foo'"
        )
    )

    # Nothing is read (and nothing asserted) when ``engine`` is None.
    if engine is None:
        return

    with pytest.raises(TypeError, match=msg):
        pd.read_excel(
            "test1" + read_ext,
            sheet_name="Sheet1",
            index_col=0,
            engine_kwargs=bogus_kwargs[ext],
        )

def test_usecols_int(self, read_ext):
# usecols as int
msg = "Passing an integer for `usecols`"
Expand Down

0 comments on commit 7eeec0d

Please sign in to comment.