astronomer · pankajastro · Aug 11, 2023 · Jul 2, 2023 · Jul 2, 2023 · Jul 2, 2023
@@ -120,6 +120,7 @@ pip install astro-sdk-python[amazon,google,snowflake,postgres]
 | json     |
 | ndjson   |
 | parquet  |
+| xls,xlsx |
 
 | Database  |
 | :-------- |

diff --git a/python-sdk/dev/Dockerfile b/python-sdk/dev/Dockerfile
@@ -11,7 +11,8 @@ RUN apt-get install -y --no-install-recommends \
         freetds-dev \
         libssl-dev \
         libkrb5-dev \
-        libmariadb-dev
+        libmariadb-dev \
+        pkg-config
 
 ENV SETUPTOOLS_USE_DISTUTILS=stdlib
 

@@ -39,6 +39,7 @@ class FileType(Enum):
     JSON = "json"
     NDJSON = "ndjson"
     PARQUET = "parquet"
+    EXCEL = "xls,xlsx"
     # [END filetypes]
 
     def __str__(self) -> str:

@@ -16,6 +16,7 @@ class PandasLoadOptions(LoadOptions):
      1. CSV file type - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
      2. NDJSON/JSON file type - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html
      3. Parquet file type - https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
+     4. Excel file type - https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
 
     :param delimiter: Delimiter to use. Defaults to None
     :param dtype: Data type for data or columns.

@@ -92,7 +92,7 @@ def is_binary(self) -> bool:
 
         :return: True or False
         """
-        result: bool = self.type.name == constants.FileType.PARQUET
+        result: bool = self.type.name in (constants.FileType.PARQUET, constants.FileType.EXCEL)
         return result
 
     def is_local(self) -> bool:

@@ -5,6 +5,7 @@
 from astro.constants import FileType as FileTypeConstants
 from astro.files.types.base import FileType
 from astro.files.types.csv import CSVFileType
+from astro.files.types.excel import ExcelFileType
 from astro.files.types.json import JSONFileType
 from astro.files.types.ndjson import NDJSONFileType
 from astro.files.types.parquet import ParquetFileType
@@ -23,6 +24,7 @@ def create_file_type(
         FileTypeConstants.JSON: JSONFileType,
         FileTypeConstants.NDJSON: NDJSONFileType,
         FileTypeConstants.PARQUET: ParquetFileType,
+        FileTypeConstants.EXCEL: ExcelFileType,
     }
     if not filetype:
         filetype = get_filetype(path)
@@ -49,9 +51,13 @@ def get_filetype(filepath: str | pathlib.PosixPath) -> FileTypeConstants:
 
     :param filepath: URI or Path to a file
     :type filepath: str or pathlib.PosixPath
-    :return: The filetype (e.g. csv, ndjson, json, parquet)
+    :return: The filetype (e.g. csv, ndjson, json, parquet, excel)
     :rtype: astro.constants.FileType
     """
+    ext_to_filetype: dict[str, type[FileTypeConstants]] = {
+        ext: t for t in FileTypeConstants for ext in t.value.split(",")
+    }
+
     if isinstance(filepath, pathlib.PosixPath):
         extension = filepath.suffix[1:]
     else:
@@ -67,6 +73,6 @@ def get_filetype(filepath: str | pathlib.PosixPath) -> FileTypeConstants:
         )
 
     try:
-        return FileTypeConstants(extension)
-    except ValueError:
+        return ext_to_filetype[extension.lower()]
+    except KeyError:
         raise ValueError(f"Unsupported filetype '{extension}' from file '{filepath}'.")
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import io
+
+import pandas as pd
+
+from astro.constants import FileType as FileTypeConstants
+from astro.dataframes.load_options import PandasLoadOptions
+from astro.dataframes.pandas import PandasDataframe
+from astro.files.types.base import FileType
+from astro.utils.dataframe import convert_columns_names_capitalization
+
+
+class ExcelFileType(FileType):
+    """Concrete implementation to handle Excel file type"""
+
+    LOAD_OPTIONS_CLASS_NAME = ("PandasLoadOptions",)
+
+    # We need skipcq because it's a method override, so we don't want to make it a static method
+    def export_to_dataframe(
+        self,
+        stream,
+        columns_names_capitalization="original",
+        **kwargs,
+    ) -> pd.DataFrame:  # skipcq PYL-R0201
+        """read Excel file from one of the supported locations and return dataframe
+
+        :param stream: file stream object
+        :param columns_names_capitalization: determines whether to convert all columns to lowercase/uppercase
+            in the resulting dataframe
+        """
+        if isinstance(self.load_options, PandasLoadOptions):
+            kwargs = self.load_options.populate_kwargs(kwargs)
+        df = pd.read_excel(stream, **kwargs)
+        df = convert_columns_names_capitalization(
+            df=df, columns_names_capitalization=columns_names_capitalization
+        )
+        return PandasDataframe.from_pandas_df(df)
+
+    # We need skipcq because it's a method override, so we don't want to make it a static method
+    def create_from_dataframe(self, df: pd.DataFrame, stream: io.TextIOWrapper) -> None:  # skipcq PYL-R0201
+        """Write Excel file to one of the supported locations
+
+        :param df: pandas dataframe
+        :param stream: file stream object
+        """
+        df.to_excel(stream, index=False)
+
+    @property
+    def name(self):
+        return FileTypeConstants.EXCEL
@@ -24,6 +24,7 @@
     (False, "/tmp/sample.json"),
     (False, "/tmp/sample.ndjson"),
     (True, "/tmp/sample.parquet"),
+    (True, "/tmp/sample.xlsx"),
 ]
 
 
@@ -49,12 +50,13 @@ def test_is_binary(filetype):
         (False, "/tmp/sample.json"),
         (False, "/tmp/sample.ndjson"),
         (False, "/tmp/sample.parquet"),
+        (False, "/tmp/sample.xlsx"),
         (True, "/tmp/"),
         (True, "s3://tmp/home_*"),
         (False, "s3://tmp/.folder/sample.csv"),
         (True, "s3://tmp/.folder/"),
     ],
-    ids=["csv", "json", "ndjson", "parquet", "csv", "json", "csv", "json"],
+    ids=["csv", "json", "ndjson", "parquet", "xlsx", "csv", "json", "csv", "json"],
 )
 def test_is_pattern(filetype):
     """Test if the file is a file pattern"""
@@ -226,8 +228,9 @@ def test_if_file_object_can_be_pickled():
         {"type": "ndjson", "expected_class": PandasLoadOptions},
         {"type": "json", "expected_class": PandasLoadOptions},
         {"type": "parquet", "expected_class": PandasLoadOptions},
+        {"type": "xlsx", "expected_class": PandasLoadOptions},
     ],
-    ids=["csv", "ndjson", "json", "parquet"],
+    ids=["csv", "ndjson", "json", "parquet", "xlsx"],
 )
 @pytest.mark.parametrize(
     "file_location",

@@ -0,0 +1,46 @@
+import pathlib
+import tempfile
+from unittest import mock
+
+import pandas as pd
+
+from astro.dataframes.load_options import PandasLoadOptions
+from astro.dataframes.pandas import PandasDataframe
+from astro.files.types import ExcelFileType
+
+sample_file = pathlib.Path(pathlib.Path(__file__).parent.parent.parent, "data/sample.xlsx")
+
+
+def test_read_excel_file():
+    """Test reading of excel file from local location"""
+    path = str(sample_file.absolute())
+    excel_type = ExcelFileType(path)
+    with open(path, "rb") as file:
+        df = excel_type.export_to_dataframe(file)
+    assert df.shape == (3, 2)
+    assert isinstance(df, PandasDataframe)
+
+
+@mock.patch("astro.files.types.excel.pd.read_excel")
+def test_read_excel_file_with_pandas_opts(mock_read_excel):
+    """Test that pandas options get passed to read_excel"""
+    path = str(sample_file.absolute())
+    excel_type = ExcelFileType(path, load_options=PandasLoadOptions())
+    with open(path, "rb") as file:
+        excel_type.export_to_dataframe(file)
+    mock_read_excel.assert_called_once_with(file)
+
+
+def test_write_excel_file():
+    """Test writing of excel file to local location"""
+    with tempfile.NamedTemporaryFile() as temp_file:
+        path = temp_file.name
+        data = {
+            "id": [1, 2, 3],
+            "name": ["First", "Second", "Third with unicode पांचाल"],
+        }
+        df = pd.DataFrame(data=data)
+
+        excel_type = ExcelFileType(path)
+        excel_type.create_from_dataframe(stream=temp_file, df=df)
+        assert pd.read_excel(path).shape == (3, 2)
@@ -11,6 +11,7 @@
     (FileType.JSON, "sample.json"),
     (FileType.NDJSON, "sample.ndjson"),
     (FileType.PARQUET, "sample.parquet"),
+    (FileType.EXCEL, "sample.xlsx"),
 ]
 sample_filetypes = [items[0] for items in sample_filepaths_per_filetype]
 sample_filepaths = [items[1] for items in sample_filepaths_per_filetype]

@@ -7,7 +7,7 @@ def test_supported_file_locations():
 
 
 def test_supported_file_types():
-    expected = {"csv", "json", "ndjson", "parquet"}
+    expected = {"csv", "json", "ndjson", "parquet", "xls,xlsx"}
     assert set(SUPPORTED_FILE_TYPES) == expected