From a2b46b0dd786f431a3fa39960a65aa734ad0ee6f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 14 Oct 2023 15:13:38 -0600 Subject: [PATCH] Cleanups (#276) - prettier - stricter mypy - remove split-reduce --- .github/workflows/benchmarks.yml | 2 +- .github/workflows/ci.yaml | 20 +- .github/workflows/pypi.yaml | 2 +- .gitignore | 1 + .pre-commit-config.yaml | 101 +- asv_bench/asv.conf.json | 192 +- asv_bench/benchmarks/README_CI.md | 2 +- ci/benchmark.yml | 2 +- ci/docs.yml | 4 +- ci/environment.yml | 7 +- ci/minimal-requirements.yml | 2 +- ci/no-dask.yml | 2 +- ci/no-numba.yml | 24 + ci/no-xarray.yml | 2 +- ci/upstream-dev-env.yml | 10 +- codecov.yml | 6 +- docs/source/_static/style.css | 4 +- docs/source/engines.md | 2 +- docs/source/implementation.md | 12 +- docs/source/index.md | 4 +- .../user-stories/hourly-climatology.html | 15025 +++++++++++++++- flox/aggregations.py | 2 +- flox/cache.py | 2 +- flox/core.py | 39 +- flox/xarray.py | 6 +- flox/xrutils.py | 2 +- pyproject.toml | 4 +- readthedocs.yml | 6 +- tests/__init__.py | 4 +- tests/test_core.py | 2 +- 30 files changed, 15194 insertions(+), 299 deletions(-) create mode 100644 ci/no-numba.yml diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index d8c39cdfd..3a4215d40 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -8,7 +8,7 @@ on: jobs: benchmark: # if: ${{ contains( github.event.pull_request.labels.*.name, 'run-benchmark') && github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} # Run if the PR has been labelled correctly. - if: ${{ github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} # Always run. + if: ${{ github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} # Always run. name: Linux runs-on: ubuntu-20.04 env: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b963d671d..51777368c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -59,20 +59,20 @@ jobs: optional-deps: name: ${{ matrix.env }} - runs-on: ${{ matrix.os }} + runs-on: "ubuntu-latest" defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - env: - [ - "no-xarray", - "no-dask", - "minimal-requirements", - ] + python-version: ["3.11"] + env: ["no-xarray", "no-dask"] + include: + - env: "no-numba" + python-version: "3.12" + - env: "minimal-requirements" + python-version: "3.9" steps: - uses: actions/checkout@v4 with: @@ -80,7 +80,7 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v1 with: - environment-file: ci/environment.yml + environment-file: ci/${{ matrix.env }}.yml environment-name: flox-tests init-shell: bash cache-environment: true @@ -110,7 +110,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - repository: 'pydata/xarray' + repository: "pydata/xarray" fetch-depth: 0 # Fetch all history for all branches and tags. 
- name: Set up conda environment uses: mamba-org/setup-micromamba@v1 diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 93891fcd7..1cd8b20f2 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -12,7 +12,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.gitignore b/.gitignore index dbfcb2ef4..04f8d580d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ docs/source/generated/ html/ .asv/ +asv_bench/pkgs/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f7e20ea85..080a08bab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,56 +1,61 @@ ci: - autoupdate_schedule: quarterly + autoupdate_schedule: quarterly repos: - - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: 'v0.0.292' - hooks: - - id: ruff - args: ["--fix", "--show-fixes"] - - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 - hooks: - - id: check-yaml - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-docstring-first - - - repo: https://github.com/psf/black-pre-commit-mirror - rev: 23.9.1 - hooks: - - id: black - - - repo: https://github.com/executablebooks/mdformat - rev: 0.7.17 - hooks: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: "v0.0.292" + hooks: + - id: ruff + args: ["--fix", "--show-fixes"] + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v3.0.3" + hooks: + - id: prettier + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-yaml + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-docstring-first + + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 23.9.1 + hooks: + - id: black + + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 + hooks: - id: mdformat additional_dependencies: - mdformat-black - mdformat-myst - - repo: https://github.com/nbQA-dev/nbQA - rev: 1.7.0 - hooks: - - id: nbqa-black - - id: nbqa-ruff - args: [--fix] - - - repo: https://github.com/kynan/nbstripout - rev: 0.6.1 - hooks: - - id: nbstripout - args: [--extra-keys=metadata.kernelspec metadata.language_info.version] - - - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - additional_dependencies: - - tomli - - - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.14 - hooks: - - id: validate-pyproject + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.7.0 + hooks: + - id: nbqa-black + - id: nbqa-ruff + args: [--fix] + + - repo: https://github.com/kynan/nbstripout + rev: 0.6.1 + hooks: + - id: nbstripout + args: [--extra-keys=metadata.kernelspec metadata.language_info.version] + + - repo: https://github.com/codespell-project/codespell + rev: v2.2.6 + hooks: + - id: codespell + additional_dependencies: + - tomli + + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.15 + hooks: + - id: validate-pyproject diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cfcbdf230..207572fce 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -1,98 +1,98 @@ { - // The version of the config file format. Do not change, unless - // you know what you are doing. 
- "version": 1, - - // The name of the project being benchmarked - "project": "flox", - - // The project's homepage - "project_url": "http://flox.readthedocs.io/", - - // The URL or local path of the source code repository for the - // project being benchmarked - "repo": "..", - - // The Python project's subdirectory in your repo. If missing or - // the empty string, the project is assumed to be located at the root - // of the repository. - // "repo_subdir": "", - - // Customizable commands for building, installing, and - // uninstalling the project. See asv.conf.json documentation. - // - // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], - // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - // "build_command": [ - // "python setup.py build", - // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" - // ], - - // List of branches to benchmark. If not provided, defaults to "master" - // (for git) or "default" (for mercurial). - "branches": ["main"], // for git - "dvcs": "git", - - // timeout in seconds for installing any dependencies in environment - // defaults to 10 min - "install_timeout": 600, - - // the base URL to show a commit for the project. - "show_commit_url": "http://github.com/xarray-contrib/flox/commit/", - - // The Pythons you'd like to test against. If not provided, defaults - // to the current version of Python used to run `asv`. - // "pythons": ["3.9"], - - "environment_type": "mamba", - "conda_channels": ["conda-forge"], - "conda_environment_file": "../ci/benchmark.yml", - - // The directory (relative to the current directory) that benchmarks are - // stored in. If not provided, defaults to "benchmarks" - "benchmark_dir": "benchmarks", - - // The directory (relative to the current directory) to cache the Python - // environments in. If not provided, defaults to "env" - "env_dir": ".asv/env", - - // The directory (relative to the current directory) that raw benchmark - // results are stored in. If not provided, defaults to "results". - "results_dir": ".asv/results", - - // The directory (relative to the current directory) that the html tree - // should be written to. If not provided, defaults to "html". - "html_dir": ".asv/html", - - // The number of characters to retain in the commit hashes. - // "hash_length": 8, - - // `asv` will cache results of the recent builds in each - // environment, making them faster to install next time. This is - // the number of builds to keep, per environment. - // "build_cache_size": 2, - - // The commits after which the regression search in `asv publish` - // should start looking for regressions. Dictionary whose keys are - // regexps matching to benchmark names, and values corresponding to - // the commit (exclusive) after which to start looking for - // regressions. The default is to start from the first commit - // with results. If the commit is `null`, regression detection is - // skipped for the matching benchmark. - // - // "regressions_first_commits": { - // "some_benchmark": "352cdf", // Consider regressions only after this commit - // "another_benchmark": null, // Skip regression detection altogether - // }, - - // The thresholds for relative change in results, after which `asv - // publish` starts reporting regressions. Dictionary of the same - // form as in ``regressions_first_commits``, with values - // indicating the thresholds. If multiple entries match, the - // maximum is taken. 
If no entry matches, the default is 5%. - // - // "regressions_thresholds": { - // "some_benchmark": 0.01, // Threshold of 1% - // "another_benchmark": 0.5, // Threshold of 50% - // }, + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "flox", + + // The project's homepage + "project_url": "http://flox.readthedocs.io/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["main"], // for git + "dvcs": "git", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + "install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "http://github.com/xarray-contrib/flox/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.9"], + + "environment_type": "mamba", + "conda_channels": ["conda-forge"], + "conda_environment_file": "../ci/benchmark.yml", + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html" + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. 
+ // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, } diff --git a/asv_bench/benchmarks/README_CI.md b/asv_bench/benchmarks/README_CI.md index f306736ab..a2e04e3ac 100644 --- a/asv_bench/benchmarks/README_CI.md +++ b/asv_bench/benchmarks/README_CI.md @@ -13,7 +13,7 @@ The `asv` suite can be run for any PR on GitHub Actions (check workflow `.github We use `asv continuous` to run the job, which runs a relative performance measurement. This means that there's no state to be saved and that regressions are only caught in terms of performance ratio (absolute numbers are available but they are not useful since we do not use stable hardware over time). `asv continuous` will: - Compile `scikit-image` for _both_ commits. We use `ccache` to speed up the process, and `mamba` is used to create the build environments. -- Run the benchmark suite for both commits, _twice_ (since `processes=2` by default). +- Run the benchmark suite for both commits, _twice_ (since `processes=2` by default). - Generate a report table with performance ratios: - `ratio=1.0` -> performance didn't change. - `ratio<1.0` -> PR made it slower. diff --git a/ci/benchmark.yml b/ci/benchmark.yml index e4987000b..80667a4f6 100644 --- a/ci/benchmark.yml +++ b/ci/benchmark.yml @@ -5,7 +5,7 @@ dependencies: - asv - cachey - dask-core - - numpy>=1.20 + - numpy>=1.21 - mamba - pip - python=3.10 diff --git a/ci/docs.yml b/ci/docs.yml index c3359e4f1..ad8feee08 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -5,7 +5,7 @@ dependencies: - dask-core - pip - xarray - - numpy>=1.20 + - numpy>=1.21 - numpydoc - numpy_groupies>=0.9.19 - toolz @@ -18,4 +18,4 @@ dependencies: - jupyter - sphinx-codeautolink - pip: - - -e .. + - -e .. 
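The `README_CI.md` changes above describe how `asv continuous` compares a PR against its base commit. For orientation, the sketch below shows the general shape of a benchmark that `asv` collects from `asv_bench/benchmarks/`; the class name, array shapes, and labels are hypothetical and not part of this patch. Only the `flox.groupby_reduce` call and the `time_*` naming convention are assumed, from the project's public API and asv's documented behaviour.

```python
# Hypothetical asv benchmark sketch (not from this patch): asv imports modules
# under asv_bench/benchmarks/ and times every method whose name starts with time_.
import numpy as np

import flox


class TimeGroupByReduce:
    def setup(self):
        # 2D data reduced along the last axis by a 1D array of 12 group labels.
        rng = np.random.default_rng(0)
        self.array = rng.normal(size=(100, 1200))
        self.labels = np.tile(np.arange(12), 100)

    def time_sum(self):
        flox.groupby_reduce(self.array, self.labels, func="sum", engine="numpy")
```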
diff --git a/ci/environment.yml b/ci/environment.yml index 33e1b4661..ca9111105 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -8,8 +8,8 @@ dependencies: - dask-core - netcdf4 - pandas - - numpy>=1.20 - - lxml # for mypy coverage report + - numpy>=1.21 + - lxml # for mypy coverage report - matplotlib - pip - pytest @@ -18,10 +18,9 @@ dependencies: - pytest-xdist - xarray - pre-commit + - numbagg>=0.3 - numpy_groupies>=0.9.19 - pooch - toolz - numba - scipy - - pip: - - numbagg>=0.3 diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml index eeb075194..40b287f64 100644 --- a/ci/minimal-requirements.yml +++ b/ci/minimal-requirements.yml @@ -9,7 +9,7 @@ dependencies: - pytest-cov - pytest-pretty - pytest-xdist - - numpy==1.20 + - numpy==1.21 - numpy_groupies==0.9.19 - pandas - pooch diff --git a/ci/no-dask.yml b/ci/no-dask.yml index 8876a84cf..f74d522b6 100644 --- a/ci/no-dask.yml +++ b/ci/no-dask.yml @@ -5,7 +5,7 @@ dependencies: - codecov - netcdf4 - pandas - - numpy>=1.20 + - numpy>=1.21 - pip - pytest - pytest-cov diff --git a/ci/no-numba.yml b/ci/no-numba.yml new file mode 100644 index 000000000..1d846d117 --- /dev/null +++ b/ci/no-numba.yml @@ -0,0 +1,24 @@ +name: flox-tests +channels: + - conda-forge +dependencies: + - asv + - cachey + - codecov + - dask-core + - netcdf4 + - pandas + - numpy>=1.21 + - lxml # for mypy coverage report + - matplotlib + - pip + - pytest + - pytest-cov + - pytest-pretty + - pytest-xdist + - xarray + - pre-commit + - numpy_groupies>=0.9.19 + - pooch + - toolz + - scipy diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml index 491d7ba8e..9eb75cf96 100644 --- a/ci/no-xarray.yml +++ b/ci/no-xarray.yml @@ -5,7 +5,7 @@ dependencies: - codecov - netcdf4 - pandas - - numpy>=1.20 + - numpy>=1.21 - pip - pytest - pytest-cov diff --git a/ci/upstream-dev-env.yml b/ci/upstream-dev-env.yml index 6b8e796ea..47e6202ff 100644 --- a/ci/upstream-dev-env.yml +++ b/ci/upstream-dev-env.yml @@ -14,8 +14,8 @@ dependencies: - pytest-xdist - pip - pip: - - git+https://github.com/pydata/xarray - - git+https://github.com/pandas-dev/pandas - - git+https://github.com/dask/dask - - git+https://github.com/ml31415/numpy-groupies - - git+https://github.com/numbagg/numbagg + - git+https://github.com/pydata/xarray + - git+https://github.com/pandas-dev/pandas + - git+https://github.com/dask/dask + - git+https://github.com/ml31415/numpy-groupies + - git+https://github.com/numbagg/numbagg diff --git a/codecov.yml b/codecov.yml index 90c354ae2..6228abe13 100644 --- a/codecov.yml +++ b/codecov.yml @@ -5,9 +5,9 @@ codecov: comment: false ignore: - - 'benchmarks/*.py' - - 'tests/*.py' - - 'setup.py' + - "benchmarks/*.py" + - "tests/*.py" + - "setup.py" coverage: precision: 2 diff --git a/docs/source/_static/style.css b/docs/source/_static/style.css index 5d9d45bb7..d97a63fad 100644 --- a/docs/source/_static/style.css +++ b/docs/source/_static/style.css @@ -4,7 +4,9 @@ padding-left: 1.25em; border-left: thin var(--color-foreground-muted) solid; } -.xr-array-wrap, .xr-var-data, .xr-var-preview { +.xr-array-wrap, +.xr-var-data, +.xr-var-preview { font-size: 0.9em; } .gp { diff --git a/docs/source/engines.md b/docs/source/engines.md index 68c65cab5..8a33c675c 100644 --- a/docs/source/engines.md +++ b/docs/source/engines.md @@ -16,7 +16,7 @@ See [](arrays) for more details. ## Tradeoffs -For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="numbagg"` is almost always faster, and `engine="flox"` *can* be faster. 
+For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="numbagg"` is almost always faster, and `engine="flox"` _can_ be faster. The reason is that `numpy_groupies` converts all groupby problems to a 1D problem, this can involve [some overhead](https://github.com/ml31415/numpy-groupies/pull/46). It is possible to optimize this a bit in `flox` or `numpy_groupies`, but the work has not been done yet. diff --git a/docs/source/implementation.md b/docs/source/implementation.md index 29d9faf46..9a66a6c8d 100644 --- a/docs/source/implementation.md +++ b/docs/source/implementation.md @@ -43,7 +43,7 @@ width: 100% --- ``` -The first step is to extract all members of a group, which involves a *lot* of +The first step is to extract all members of a group, which involves a _lot_ of communication and is quite expensive (in dataframe terminology, this is a "shuffle"). This is fundamentally why many groupby reductions don't work well right now with big datasets. @@ -129,7 +129,7 @@ width: 100% --- ``` -*Tradeoffs* +_Tradeoffs_ 1. Only works for certain groupings. 1. Group labels must be known at graph construction time, so this only works for numpy arrays @@ -146,14 +146,14 @@ width: 100% The `map-reduce` strategy is quite effective but can involve some unnecessary communication. It can be possible to exploit patterns in how group labels are distributed across chunks (similar to `method="blockwise"` above). Two cases are illustrative: -1. Groups labels can be *approximately-periodic*: e.g. `time.dayofyear` (period 365 or 366) or `time.month` (period 12). +1. Groups labels can be _approximately-periodic_: e.g. `time.dayofyear` (period 365 or 366) or `time.month` (period 12). Consider our earlier example, `groupby("time.month")` with monthly frequency data and chunksize of 4 along `time`. ![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) Because a chunksize of 4 evenly divides the number of groups (12) all we need to do is index out blocks 0, 3, 7 and then apply the `"map-reduce"` strategy to form the final result for months Jan-Apr. Repeat for the remaining groups of months (May-Aug; Sep-Dec) and then concatenate. -1. Groups can be *spatially localized* like the blockwise case above, for example grouping by country administrative boundaries like +1. Groups can be _spatially localized_ like the blockwise case above, for example grouping by country administrative boundaries like counties or districts. In this case, concatenating the result for the northwesternmost county or district and the southeasternmost district can involve a lot of wasteful communication (again depending on chunking). @@ -172,7 +172,7 @@ Consider our earlier example, `groupby("time.month")` with monthly frequency dat ![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) With `method="map-reduce", reindex=True`, each block will become 3x its original size at the blockwise step: input blocks have 4 timesteps while output block -has a value for all 12 months. Note that the blockwise groupby-reduction *does not reduce* the data since there is only one element in each +has a value for all 12 months. Note that the blockwise groupby-reduction _does not reduce_ the data since there is only one element in each group. In addition, since `map-reduce` will make the final result have only one chunk of size 12 along the new `month` dimension, the final result has chunk sizes 3x that of the input, which may not be ideal. 
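The cohorts walk-through above (monthly labels, chunk size 4 along time) maps onto a short sketch; the shapes and chunking below are illustrative and not taken from the patch, while `flox.groupby_reduce` and `method="cohorts"` are the documented public API.

```python
# Illustrative sketch of the "cohorts" strategy described above: monthly-frequency
# data chunked by 4 along time, so each cohort of month labels (e.g. Jan-Apr)
# only touches every third block before the usual map-reduce combine.
import dask.array
import numpy as np

import flox

ntime = 120  # ten years of monthly values
array = dask.array.ones((10, ntime), chunks=(-1, 4))
months = np.tile(np.arange(1, 13), ntime // 12)  # group labels 1..12

result, groups = flox.groupby_reduce(
    array, months, func="mean", method="cohorts", engine="numpy"
)
```

Because the chunk size of 4 evenly divides the 12 labels, each of the three cohorts needs only a third of the blocks, which is exactly the communication saving the text above describes.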
@@ -184,7 +184,7 @@ remaining groups of months (May-Aug; Sep-Dec) and then concatenate. This is the
 We can generalize this idea for more complicated problems (inspired by the `split_out`kwarg in `dask.dataframe.groupby`)
 We first apply the groupby-reduction blockwise, then split and reindex blocks to create a new array with which we complete the reduction
-using `map-reduce`. Because the split or shuffle step occurs after the blockwise reduction, we *sometimes* communicate a significantly smaller
+using `map-reduce`. Because the split or shuffle step occurs after the blockwise reduction, we _sometimes_ communicate a significantly smaller
 amount of data than if we split or shuffled the input array.

 ```{image} /../diagrams/new-cohorts-annotated.svg
diff --git a/docs/source/index.md b/docs/source/index.md
index 9fdd470c2..353dddfbe 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -31,8 +31,8 @@ See a presentation ([video](https://discourse.pangeo.io/t/november-17-2021-flox-
 1. {py:func}`flox.xarray.xarray_reduce` [extends](xarray.md) Xarray's GroupBy operations allowing lazy grouping by dask arrays, grouping by multiple arrays, as well as combining categorical grouping and histogram-style binning operations using multiple variables.
 1. `flox` also provides utility functions for rechunking both dask arrays and Xarray objects along a single dimension using the group labels as a guide:
-   1. To rechunk for blockwise operations: {py:func}`flox.rechunk_for_blockwise`, {py:func}`flox.xarray.rechunk_for_blockwise`.
-   1. To rechunk so that "cohorts", or groups of labels, tend to occur in the same chunks: {py:func}`flox.rechunk_for_cohorts`, {py:func}`flox.xarray.rechunk_for_cohorts`.
+   1. To rechunk for blockwise operations: {py:func}`flox.rechunk_for_blockwise`, {py:func}`flox.xarray.rechunk_for_blockwise`.
+   1. To rechunk so that "cohorts", or groups of labels, tend to occur in the same chunks: {py:func}`flox.rechunk_for_cohorts`, {py:func}`flox.xarray.rechunk_for_cohorts`.

 ## Installing
diff --git a/docs/source/user-stories/hourly-climatology.html b/docs/source/user-stories/hourly-climatology.html
index fc78db18d..f626ee551 100644
--- a/docs/source/user-stories/hourly-climatology.html
+++ b/docs/source/user-stories/hourly-climatology.html
@@ -1,88 +1,14961 @@
[hunk body omitted: the file is a regenerated "Dask Performance Report", roughly 15,000 lines of Bokeh-generated HTML and JavaScript]
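Stepping back to the engine tradeoff documented in `docs/source/engines.md` above (`engine="numbagg"` is almost always faster for nD-array-by-1D-labels reductions, and `engine="flox"` can be), a minimal comparison sketch follows. The array sizes are made up; only the engine names listed in the `groupby_reduce` docstring ("flox", "numpy", "numba", "numbagg") come from the patch, and `numbagg` is an optional dependency (added to `ci/environment.yml` here).

```python
# Hedged sketch comparing engines for an nD-array-by-1D-labels reduction.
# Timings are hardware dependent; "numbagg" requires the optional numbagg package.
import numpy as np

import flox

rng = np.random.default_rng(42)
array = rng.normal(size=(50, 10_000))
labels = rng.integers(0, 12, size=10_000)

for engine in ("numpy", "flox", "numbagg"):
    result, groups = flox.groupby_reduce(array, labels, func="nanmean", engine=engine)
```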
diff --git a/flox/aggregations.py b/flox/aggregations.py
index ec696da53..b2129c197 100644
--- a/flox/aggregations.py
+++ b/flox/aggregations.py
@@ -225,7 +225,7 @@ def __init__(
             "final": final_dtype,
             "intermediate": self._normalize_dtype_fill_value(dtypes, "dtype"),
         }
-        self.dtype: AggDtype = None  # type: ignore
+        self.dtype: AggDtype = None  # type: ignore[assignment]

         # The following are set by _initialize_aggregation
         self.finalize_kwargs: dict[Any, Any] = {}
diff --git a/flox/cache.py b/flox/cache.py
index dd6416aea..e43c540d6 100644
--- a/flox/cache.py
+++ b/flox/cache.py
@@ -9,4 +9,4 @@
     memoize = partial(cache.memoize, key=dask.base.tokenize)
 except ImportError:
     cache = {}
-    memoize = lambda x: x  # type: ignore
+    memoize = lambda x: x  # type: ignore[assignment]
diff --git a/flox/core.py b/flox/core.py
index e2784ae99..482c8fac1 100644
--- a/flox/core.py
+++ b/flox/core.py
@@ -7,7 +7,7 @@
 import sys
 import warnings
 from collections import namedtuple
-from collections.abc import Mapping, Sequence
+from collections.abc import Sequence
 from functools import partial, reduce
 from numbers import Integral
 from typing import (
@@ -42,9 +42,10 @@
     else:
         from typing import Unpack
 except (ModuleNotFoundError, ImportError):
-    Unpack: Any  # type: ignore
+    Unpack: Any  # type: ignore[no-redef]

     import dask.array.Array as DaskArray
+    from dask.typing import Graph

 T_DuckArray = Union[np.ndarray, DaskArray]  # Any ?
 T_By = T_DuckArray
@@ -576,7 +577,7 @@ def factorize_(
             idx = sorter[(idx,)]
             idx[mask] = -1
         else:
-            idx, groups = pd.factorize(flat, sort=sort)  # type: ignore # pandas issue?
+            idx, groups = pd.factorize(flat, sort=sort)  # type: ignore[arg-type]

         found_groups.append(np.array(groups))
         factorized.append(idx.reshape(groupvar.shape))
@@ -1156,11 +1157,7 @@ def _reduce_blockwise(
         agg.finalize = None

     assert agg.finalize_kwargs is not None
-    if isinstance(agg.finalize_kwargs, Mapping):
-        finalize_kwargs_: tuple[dict[Any, Any], ...] = (agg.finalize_kwargs,)
-    else:
-        finalize_kwargs_ = agg.finalize_kwargs
-    finalize_kwargs_ += ({},) + ({},)
+    finalize_kwargs_: tuple[dict[Any, Any], ...]
= (agg.finalize_kwargs,) + ({},) + ({},) results = chunk_reduce( array, @@ -1266,7 +1263,7 @@ def subset_to_blocks( chunks = tuple(tuple(np.array(c)[i].tolist()) for c, i in zip(array.chunks, squeezed)) keys = itertools.product(*(range(len(c)) for c in chunks)) - layer = {(name,) + key: tuple(new_keys[key].tolist()) for key in keys} + layer: Graph = {(name,) + key: tuple(new_keys[key].tolist()) for key in keys} graph = HighLevelGraph.from_collections(name, layer, dependencies=[array]) return dask.array.Array(graph, name, chunks, meta=array) @@ -1276,14 +1273,11 @@ def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]: import dask.array from dask.highlevelgraph import HighLevelGraph - layer: dict[tuple, tuple] = {} groups_token = f"group-{reduced.name}" first_block = reduced.ndim * (0,) - layer[(groups_token, *first_block)] = ( - operator.getitem, - (reduced.name, *first_block), - "groups", - ) + layer: Graph = { + (groups_token, *first_block): (operator.getitem, (reduced.name, *first_block), "groups") + } groups: tuple[DaskArray] = ( dask.array.Array( HighLevelGraph.from_collections(groups_token, layer, dependencies=[reduced]), @@ -1537,6 +1531,7 @@ def _collapse_blocks_along_axes(reduced: DaskArray, axis: T_Axes, group_chunks) inchunk = ochunk[: -len(axis)] + np.unravel_index(ochunk[-1], nblocks) layer2[(name, *ochunk)] = (reduced.name, *inchunk) + layer2: Graph return dask.array.Array( HighLevelGraph.from_collections(name, layer2, dependencies=[reduced]), name, @@ -1814,7 +1809,7 @@ def groupby_reduce( fewer than min_count non-NA values are present the result will be NA. Only used if skipna is set to True or defaults to True for the array's dtype. - method : {"map-reduce", "blockwise", "cohorts", "split-reduce"}, optional + method : {"map-reduce", "blockwise", "cohorts"}, optional Strategy for reduction of dask arrays only: * ``"map-reduce"``: First apply the reduction blockwise on ``array``, then @@ -1838,8 +1833,6 @@ def groupby_reduce( 'month', dayofyear' etc. Optimize chunking ``array`` for this method by first rechunking using ``rechunk_for_cohorts`` (for 1D ``by`` only). - * ``"split-reduce"``: - Same as "cohorts" and will be removed soon. engine : {"flox", "numpy", "numba", "numbagg"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: * ``"numpy"``: @@ -1915,12 +1908,9 @@ def groupby_reduce( "Try engine='numpy' or engine='numba' instead." ) - if method in ["split-reduce", "cohorts"] and any_by_dask: + if method == "cohorts" and any_by_dask: raise ValueError(f"method={method!r} can only be used when grouping by numpy arrays.") - if method == "split-reduce": - method = "cohorts" - reindex = _validate_reindex( reindex, func, method, expected_groups, any_by_dask, is_duck_dask_array(array) ) @@ -1976,7 +1966,7 @@ def groupby_reduce( axis_ = tuple(array.ndim + np.arange(-by_.ndim, 0)) else: # TODO: How come this function doesn't exist according to mypy? - axis_ = np.core.numeric.normalize_axis_tuple(axis, array.ndim) # type: ignore + axis_ = np.core.numeric.normalize_axis_tuple(axis, array.ndim) # type: ignore[attr-defined] nax = len(axis_) has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_) @@ -2053,7 +2043,8 @@ def groupby_reduce( # TODO: How else to narrow that array.chunks is there? 
assert isinstance(array, DaskArray) - if agg.chunk[0] is None and method != "blockwise": + # TODO: fix typing of FuncTuple in Aggregation + if agg.chunk[0] is None and method != "blockwise": # type: ignore[unreachable] raise NotImplementedError( f"Aggregation {agg.name!r} is only implemented for dask arrays when method='blockwise'." f"Received method={method!r}" diff --git a/flox/xarray.py b/flox/xarray.py index 7e8c4f2b1..0f5d68fde 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -110,7 +110,7 @@ def xarray_reduce( in ``expected_groups`` is not actually present in ``by``. dtype : data-type, optional DType for the output. Can be anything accepted by ``np.dtype``. - method : {"map-reduce", "blockwise", "cohorts", "split-reduce"}, optional + method : {"map-reduce", "blockwise", "cohorts"}, optional Strategy for reduction of dask arrays only: * ``"map-reduce"``: First apply the reduction blockwise on ``array``, then @@ -134,8 +134,6 @@ def xarray_reduce( 'month', dayofyear' etc. Optimize chunking ``array`` for this method by first rechunking using ``rechunk_for_cohorts`` (for 1D ``by`` only). - * ``"split-reduce"``: - Same as "cohorts" and will be removed soon. engine : {"flox", "numpy", "numba"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: * ``"numpy"``: @@ -253,7 +251,7 @@ def xarray_reduce( try: from xarray.indexes import PandasMultiIndex except ImportError: - PandasMultiIndex = tuple() # type: ignore + PandasMultiIndex = tuple() # type: ignore[assignment, misc] more_drop = set() for var in maybe_drop: diff --git a/flox/xrutils.py b/flox/xrutils.py index 4f80ec8c8..b1be05fe9 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -20,7 +20,7 @@ dask_array_type = dask.array.Array except ImportError: - dask_array_type = () # type: ignore + dask_array_type = () # type: ignore[assignment, misc] def asarray(data, xp=np): diff --git a/pyproject.toml b/pyproject.toml index 721661c91..0a236de62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ ] dependencies = [ "pandas", - "numpy>=1.20", + "numpy>=1.21", "numpy_groupies>=0.9.19", "toolz", ] @@ -107,6 +107,8 @@ allow_redefinition = true files = "**/*.py" show_error_codes = true warn_unused_ignores = true +warn_unreachable = true +enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] [[tool.mypy.overrides]] module=[ diff --git a/readthedocs.yml b/readthedocs.yml index ebe5245c9..25699dd03 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,11 +1,11 @@ version: 2 build: - os: 'ubuntu-20.04' + os: "ubuntu-20.04" tools: - python: 'mambaforge-4.10' + python: "mambaforge-4.10" conda: - environment: ci/docs.yml + environment: ci/docs.yml formats: [] diff --git a/tests/__init__.py b/tests/__init__.py index f46319172..4af6bc87e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -14,7 +14,7 @@ dask_array_type = da.Array except ImportError: - dask_array_type = () # type: ignore + dask_array_type = () # type: ignore[assignment, misc] try: @@ -22,7 +22,7 @@ xr_types = (xr.DataArray, xr.Dataset) except ImportError: - xr_types = () # type: ignore + xr_types = () # type: ignore[assignment] def _importorskip(modname, minversion=None): diff --git a/tests/test_core.py b/tests/test_core.py index 09ee50902..d170c95e5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1252,7 +1252,7 @@ def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dty expected = np.median(array, axis=-1, keepdims=True) assert_equal(expected, 
actual) - for method in ["map-reduce", "cohorts", "split-reduce"]: + for method in ["map-reduce", "cohorts"]: with pytest.raises(NotImplementedError): groupby_reduce( dask.array.from_array(array, chunks=(1, -1)),
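The test change above is the user-facing side of "remove split-reduce": `method="split-reduce"` is no longer accepted as an alias, and callers pass `method="cohorts"` directly. A small migration sketch follows; the shapes and labels are illustrative and not taken from the patch.

```python
# Before this patch, flox.groupby_reduce(..., method="split-reduce") was a
# deprecated alias. After it, pass method="cohorts", which is the same strategy.
import dask.array
import numpy as np

import flox

array = dask.array.random.random((4, 240), chunks=(-1, 10))
labels = np.tile(np.arange(24), 10)  # periodic labels, 24 groups

result, groups = flox.groupby_reduce(array, labels, func="max", method="cohorts")
```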