Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ExcelFileType class added #1978

Merged
merged 24 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ee78961
pkg-config added as its missing it distro
skolchin Jul 2, 2023
138a61b
ExcelFileType class added (with tests)
skolchin Jul 2, 2023
fa6c7ab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 2, 2023
f5a787f
test_supported_file_types fix
skolchin Jul 2, 2023
d9a7c41
Merge branch 'main' of https://github.com/skolchin/astro-sdk
skolchin Jul 2, 2023
c45af34
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 2, 2023
2e67e8e
Merge branch 'main' into main
skolchin Jul 5, 2023
f8f545a
xls, xlsx FileType const separated
skolchin Jul 31, 2023
7167bd1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 31, 2023
1ac19b7
Merge branch 'main' into main
skolchin Aug 1, 2023
c0babf6
ExcelFile DAG test added
skolchin Aug 10, 2023
c19479a
Merge branch 'main' into main
skolchin Aug 10, 2023
0fa19b3
Fix test and add class for each file extention type
pankajastro Aug 10, 2023
cea8063
limit databricks-sql-python
pankajastro Aug 10, 2023
1c8c57b
Fix tests
pankajastro Aug 10, 2023
c137526
Fix tests
pankajastro Aug 10, 2023
c76d433
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 10, 2023
1fd634c
Ignore type
pankajastro Aug 10, 2023
8c43168
limit databricks-sql-connector
pankajastro Aug 10, 2023
1d0e135
Reuse tests
pankajastro Aug 10, 2023
539d019
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 10, 2023
f1ab110
Reuse test
pankajastro Aug 10, 2023
31aebfe
Reuse test and fix data
pankajastro Aug 10, 2023
67d1307
Merge branch 'main' into main
pankajastro Aug 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ pip install astro-sdk-python[amazon,google,snowflake,postgres]
| json |
| ndjson |
| parquet |
| xls,xlsx |

| Database |
| :-------- |
Expand Down
3 changes: 2 additions & 1 deletion python-sdk/dev/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ RUN apt-get install -y --no-install-recommends \
freetds-dev \
libssl-dev \
libkrb5-dev \
libmariadb-dev
libmariadb-dev \
pkg-config
tatiana marked this conversation as resolved.
Show resolved Hide resolved

ENV SETUPTOOLS_USE_DISTUTILS=stdlib

Expand Down
1 change: 1 addition & 0 deletions python-sdk/src/astro/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class FileType(Enum):
JSON = "json"
NDJSON = "ndjson"
PARQUET = "parquet"
EXCEL = "xls,xlsx"
pankajastro marked this conversation as resolved.
Show resolved Hide resolved
# [END filetypes]

def __str__(self) -> str:
Expand Down
1 change: 1 addition & 0 deletions python-sdk/src/astro/dataframes/load_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class PandasLoadOptions(LoadOptions):
1. CSV file type - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
2. NDJSON/JSON file type - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html
3. Parquet file type - https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
4. Excel file type: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html

:param delimiter: Delimiter to use. Defaults to None
:param dtype: Data type for data or columns.
Expand Down
2 changes: 1 addition & 1 deletion python-sdk/src/astro/files/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def is_binary(self) -> bool:

:return: True or False
"""
result: bool = self.type.name == constants.FileType.PARQUET
result: bool = self.type.name in (constants.FileType.PARQUET, constants.FileType.EXCEL)
return result

def is_local(self) -> bool:
Expand Down
12 changes: 9 additions & 3 deletions python-sdk/src/astro/files/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from astro.constants import FileType as FileTypeConstants
from astro.files.types.base import FileType
from astro.files.types.csv import CSVFileType
from astro.files.types.excel import ExcelFileType
from astro.files.types.json import JSONFileType
from astro.files.types.ndjson import NDJSONFileType
from astro.files.types.parquet import ParquetFileType
Expand All @@ -23,6 +24,7 @@ def create_file_type(
FileTypeConstants.JSON: JSONFileType,
FileTypeConstants.NDJSON: NDJSONFileType,
FileTypeConstants.PARQUET: ParquetFileType,
FileTypeConstants.EXCEL: ExcelFileType,
}
if not filetype:
filetype = get_filetype(path)
Expand All @@ -49,9 +51,13 @@ def get_filetype(filepath: str | pathlib.PosixPath) -> FileTypeConstants:

:param filepath: URI or Path to a file
:type filepath: str or pathlib.PosixPath
:return: The filetype (e.g. csv, ndjson, json, parquet)
:return: The filetype (e.g. csv, ndjson, json, parquet, excel)
:rtype: astro.constants.FileType
"""
ext_to_filetype: dict[str, type[FileTypeConstants]] = {
ext: t for t in FileTypeConstants for ext in t.value.split(",")
}

if isinstance(filepath, pathlib.PosixPath):
extension = filepath.suffix[1:]
else:
Expand All @@ -67,6 +73,6 @@ def get_filetype(filepath: str | pathlib.PosixPath) -> FileTypeConstants:
)

try:
return FileTypeConstants(extension)
except ValueError:
return ext_to_filetype[extension.lower()]
except KeyError:
raise ValueError(f"Unsupported filetype '{extension}' from file '{filepath}'.")
51 changes: 51 additions & 0 deletions python-sdk/src/astro/files/types/excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from __future__ import annotations

import io
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking good. Just one thing - can we change the dir structure? if it's not too much trouble?
it can be something like:
python-sdk/src/astro/files/types/excel
python-sdk/src/astro/files/types/excel/base.py
python-sdk/src/astro/files/types/excel/xls.py
python-sdk/src/astro/files/types/excel/xlsx.py

WDYT?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it but some test refactoring requires

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we can do it as separate PR if needed


import pandas as pd

from astro.constants import FileType as FileTypeConstants
from astro.dataframes.load_options import PandasLoadOptions
from astro.dataframes.pandas import PandasDataframe
from astro.files.types.base import FileType
from astro.utils.dataframe import convert_columns_names_capitalization


class ExcelFileType(FileType):
"""Concrete implementation to handle Excel file type"""

LOAD_OPTIONS_CLASS_NAME = ("PandasLoadOptions",)

# We need skipcq because it's a method overloading so we don't want to make it a static method
def export_to_dataframe(
self,
stream,
columns_names_capitalization="original",
**kwargs,
) -> pd.DataFrame: # skipcq PYL-R0201
"""read Excel file from one of the supported locations and return dataframe

:param stream: file stream object
:param columns_names_capitalization: determines whether to convert all columns to lowercase/uppercase
in the resulting dataframe
"""
if isinstance(self.load_options, PandasLoadOptions):
kwargs = self.load_options.populate_kwargs(kwargs)
df = pd.read_excel(stream, **kwargs)
df = convert_columns_names_capitalization(
df=df, columns_names_capitalization=columns_names_capitalization
)
return PandasDataframe.from_pandas_df(df)

# We need skipcq because it's a method overloading so we don't want to make it a static method
def create_from_dataframe(self, df: pd.DataFrame, stream: io.TextIOWrapper) -> None: # skipcq PYL-R0201
"""Write csv file to one of the supported locations

:param df: pandas dataframe
:param stream: file stream object
"""
df.to_excel(stream, index=False)

@property
def name(self):
return FileTypeConstants.EXCEL
Binary file added python-sdk/tests/data/sample.xlsx
Binary file not shown.
7 changes: 5 additions & 2 deletions python-sdk/tests/files/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
(False, "/tmp/sample.json"),
(False, "/tmp/sample.ndjson"),
(True, "/tmp/sample.parquet"),
(True, "/tmp/sample.xlsx"),
]


Expand All @@ -49,12 +50,13 @@ def test_is_binary(filetype):
(False, "/tmp/sample.json"),
(False, "/tmp/sample.ndjson"),
(False, "/tmp/sample.parquet"),
(False, "/tmp/sample.xlsx"),
(True, "/tmp/"),
(True, "s3://tmp/home_*"),
(False, "s3://tmp/.folder/sample.csv"),
(True, "s3://tmp/.folder/"),
],
ids=["csv", "json", "ndjson", "parquet", "csv", "json", "csv", "json"],
ids=["csv", "json", "ndjson", "parquet", "xlsx", "csv", "json", "csv", "json"],
)
def test_is_pattern(filetype):
"""Test if the file is a file pattern"""
Expand Down Expand Up @@ -226,8 +228,9 @@ def test_if_file_object_can_be_pickled():
{"type": "ndjson", "expected_class": PandasLoadOptions},
{"type": "json", "expected_class": PandasLoadOptions},
{"type": "parquet", "expected_class": PandasLoadOptions},
{"type": "xlsx", "expected_class": PandasLoadOptions},
],
ids=["csv", "ndjson", "json", "parquet"],
ids=["csv", "ndjson", "json", "parquet", "xlsx"],
)
@pytest.mark.parametrize(
"file_location",
Expand Down
46 changes: 46 additions & 0 deletions python-sdk/tests/files/type/test_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pathlib
import tempfile
from unittest import mock

import pandas as pd

from astro.dataframes.load_options import PandasLoadOptions
from astro.dataframes.pandas import PandasDataframe
from astro.files.types import ExcelFileType

sample_file = pathlib.Path(pathlib.Path(__file__).parent.parent.parent, "data/sample.xlsx")


def test_read_excel_file():
"""Test reading of excel file from local location"""
path = str(sample_file.absolute())
excel_type = ExcelFileType(path)
with open(path, "rb") as file:
df = excel_type.export_to_dataframe(file)
assert df.shape == (3, 2)
assert isinstance(df, PandasDataframe)


@mock.patch("astro.files.types.excel.pd.read_excel")
def test_read_excel_file_with_pandas_opts(mock_read_excel):
"""Test pandas option get pass to read_excel"""
path = str(sample_file.absolute())
excel_type = ExcelFileType(path, load_options=PandasLoadOptions())
with open(path, "rb") as file:
excel_type.export_to_dataframe(file)
mock_read_excel.assert_called_once_with(file)


def test_write_excel_file():
"""Test writing of excel file from local location"""
with tempfile.NamedTemporaryFile() as temp_file:
path = temp_file.name
data = {
"id": [1, 2, 3],
"name": ["First", "Second", "Third with unicode पांचाल"],
}
df = pd.DataFrame(data=data)

excel_type = ExcelFileType(path)
excel_type.create_from_dataframe(stream=temp_file, df=df)
assert pd.read_excel(path).shape == (3, 2)
1 change: 1 addition & 0 deletions python-sdk/tests/files/type/test_type_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
(FileType.JSON, "sample.json"),
(FileType.NDJSON, "sample.ndjson"),
(FileType.PARQUET, "sample.parquet"),
(FileType.EXCEL, "sample.xlsx"),
]
sample_filetypes = [items[0] for items in sample_filepaths_per_filetype]
sample_filepaths = [items[1] for items in sample_filepaths_per_filetype]
Expand Down
2 changes: 1 addition & 1 deletion python-sdk/tests/test_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def test_supported_file_locations():


def test_supported_file_types():
expected = {"csv", "json", "ndjson", "parquet"}
expected = {"csv", "json", "ndjson", "parquet", "xls,xlsx"}
pankajastro marked this conversation as resolved.
Show resolved Hide resolved
assert set(SUPPORTED_FILE_TYPES) == expected


Expand Down