From e6899695cacd65e18ef12fa6f73520c8b43bc748 Mon Sep 17 00:00:00 2001
From: Irina Truong
Date: Fri, 21 Jul 2023 14:03:07 -0700
Subject: [PATCH] Simple deltalake benchmark.

---
Reviewer notes (free-form text placed after the three-dash marker and before
the first "diff --git" line; it is not part of the commit message).

* NOTE(review): the line breaks and indentation of this patch were restored
  from a copy in which all whitespace had been collapsed. Indentation of the
  YAML and Python lines is a reconstruction -- check the context and removed
  lines against the target files before applying.

* .github/workflows/tests.yml: the matrix value `pytest_args: [tests]` is
  commented out and replaced by
  `pytest_args: [tests/benchmarks/test_deltalake.py]`, and the entire
  `include:` list of `tests/stability` jobs (Python 3.11 on ubuntu, Python 3.9
  on windows and macos) is commented out.
  NOTE(review): this looks like a temporary narrowing so CI runs only the new
  benchmark -- confirm it is reverted before this is merged.

* ci/environment.yml: appends a `pip:` entry that installs dask-deltatable
  straight from its git repository, with no tag or commit pinned; the inline
  TODO asks for a link to a release version. The file has no trailing newline
  before or after this change.

* tests/benchmarks/test_deltalake.py (new file):
    - Fixture `ddf` is parametrized over "read_deltalake" and "read_parquet".
      For the first it yields `ddt.read_deltalake(uri)`, otherwise
      `dd.read_parquet(f"{uri}*.parquet", engine="pyarrow")`, with
      uri = "s3://coiled-datasets/delta/ds20f_100M/". Every test that takes
      `ddf` therefore runs once per reader.
    - `ddf` requests `small_client` but never references it in its body --
      presumably it is needed only for that fixture's setup; verify.
      `small_client` and `shuffle_method` are not defined in this patch.
    - test_column_agg computes the "sum" and "mean" of column "float1".
    - test_group_agg selects "int1", "int2", "int3", groups by
      ["int2", "int3"] (dropna=False, observed=True) and computes the "sum"
      and "mean" of "int1".
    - test_group_median uses the same selection and grouping, computes the
      "median" and "std" of "int1", and passes shuffle=shuffle_method to agg.
    - None of the tests assert on the computed result; they only call
      `.compute()`.

 .github/workflows/tests.yml        | 47 +++++++++++++++---------------
 ci/environment.yml                 |  4 ++-
 tests/benchmarks/test_deltalake.py | 34 +++++++++++++++++++++
 3 files changed, 61 insertions(+), 24 deletions(-)
 create mode 100644 tests/benchmarks/test_deltalake.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9b76cdbff5..7c952c2c56 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -32,29 +32,30 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.9"]
-        pytest_args: [tests]
-        include:
-          # Run stability tests on the lowest and highest versions of Python only
-          # These are temporarily redundant with the current global python-version
-          # - pytest_args: tests/stability
-          #   python-version: "3.9"
-          #   os: ubuntu-latest
-          # - pytest_args: tests/stability
-          #   python-version: "3.9"
-          #   os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.11"
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.11"
-            os: ubuntu-latest
-          # Run stability tests on Python Windows and MacOS (latest py39 only)
-          - pytest_args: tests/stability
-            python-version: "3.9"
-            os: windows-latest
-          - pytest_args: tests/stability
-            python-version: "3.9"
-            os: macos-latest
+        # pytest_args: [tests]
+        pytest_args: [tests/benchmarks/test_deltalake.py]
+#        include:
+#          # Run stability tests on the lowest and highest versions of Python only
+#          # These are temporarily redundant with the current global python-version
+#          # - pytest_args: tests/stability
+#          #   python-version: "3.9"
+#          #   os: ubuntu-latest
+#          # - pytest_args: tests/stability
+#          #   python-version: "3.9"
+#          #   os: ubuntu-latest
+#          - pytest_args: tests/stability
+#            python-version: "3.11"
+#            os: ubuntu-latest
+#          - pytest_args: tests/stability
+#            python-version: "3.11"
+#            os: ubuntu-latest
+#          # Run stability tests on Python Windows and MacOS (latest py39 only)
+#          - pytest_args: tests/stability
+#            python-version: "3.9"
+#            os: windows-latest
+#          - pytest_args: tests/stability
+#            python-version: "3.9"
+#            os: macos-latest
 
     steps:
       - name: Checkout
diff --git a/ci/environment.yml b/ci/environment.yml
index aadf8a1189..1e2dea97d7 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -41,4 +41,6 @@ dependencies:
   - gilknocker ==0.4.1
   - openssl >1.1.0g
   - pyopenssl ==22.1.0 # Pinned by snowflake-connector-python
-  - cryptography ==38.0.4 # Pinned by snowflake-connector-python
\ No newline at end of file
+  - cryptography ==38.0.4 # Pinned by snowflake-connector-python
+  - pip:
+    - git+https://github.com/dask-contrib/dask-deltatable.git # TODO: link to release version
\ No newline at end of file
diff --git a/tests/benchmarks/test_deltalake.py b/tests/benchmarks/test_deltalake.py
new file mode 100644
index 0000000000..2d7cf48dc4
--- /dev/null
+++ b/tests/benchmarks/test_deltalake.py
@@ -0,0 +1,34 @@
+import dask.dataframe as dd
+import dask_deltatable as ddt
+import pytest
+
+
+@pytest.fixture(params=["read_deltalake", "read_parquet"])
+def ddf(request, small_client):
+    uri = "s3://coiled-datasets/delta/ds20f_100M/"
+    if request.param == "read_deltalake":
+        yield ddt.read_deltalake(uri)
+    else:
+        yield dd.read_parquet(f"{uri}*.parquet", engine="pyarrow")
+
+
+def test_column_agg(ddf):
+    ddf["float1"].agg(["sum", "mean"]).compute()
+
+
+def test_group_agg(ddf):
+    ddf = ddf[["int1", "int2", "int3"]]
+    (
+        ddf.groupby(["int2", "int3"], dropna=False, observed=True)
+        .agg({"int1": ["sum", "mean"]})
+        .compute()
+    )
+
+
+def test_group_median(ddf, shuffle_method):
+    ddf = ddf[["int1", "int2", "int3"]]
+    (
+        ddf.groupby(["int2", "int3"], dropna=False, observed=True)
+        .agg({"int1": ["median", "std"]}, shuffle=shuffle_method)
+        .compute()
+    )