diff --git a/.asf.yaml b/.asf.yaml index 9bd4e0ef42b7f..1e7fcf1e07ece 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -24,6 +24,7 @@ github: - danepitkin - davisusanibar - felipecrv + - js8544 - mapleFU notifications: diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0d68763ae90d5..41a075b1c0bcb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -42,12 +42,12 @@ /go/ @zeroshade /java/ @lidavidm /js/ @domoritz @trxcllnt -/matlab/ @kevingurney -/python/ @AlenkaF +/matlab/ @kevingurney @kou /python/pyarrow/_flight.pyx @lidavidm /python/pyarrow/**/*gandiva* @wjones127 /r/ @paleolimbot @thisisnic /ruby/ @kou +/swift/ @kou # Docs # /docs/ diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 5c1e35da52fda..538482f96c0ee 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -29,3 +29,13 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [C#] " + ignore: + - dependency-name: "Microsoft.Extensions.*" + update-types: + - "version-update:semver-major" + - dependency-name: "Microsoft.Bcl.*" + update-types: + - "version-update:semver-major" + - dependency-name: "System.*" + update-types: + - "version-update:semver-major" diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 28a8de8dd802f..cd12be11488bb 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -67,7 +67,8 @@ jobs: image: conda-cpp llvm: "14" runs-on: ubuntu-latest - title: AMD64 Conda C++ + simd-level: AVX2 + title: AMD64 Conda C++ AVX2 ubuntu: "22.04" - arch: amd64 clang-tools: "14" @@ -85,6 +86,7 @@ jobs: ubuntu: "20.04" env: ARCH: ${{ matrix.arch }} + ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} UBUNTU: ${{ matrix.ubuntu }} @@ -151,6 +153,7 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 env: + ARROW_AZURE: ON ARROW_BUILD_TESTS: ON ARROW_DATASET: ON ARROW_FLIGHT: ON @@ -174,6 +177,10 @@ jobs: ARROW_WITH_ZSTD: ON GTest_SOURCE: BUNDLED steps: + - name: CPU Info + run: | + sysctl -a | grep cpu + sysctl -a | grep "hw.optional" - name: Checkout Arrow uses: actions/checkout@v3 with: @@ -219,7 +226,7 @@ jobs: ci/scripts/cpp_test.sh $(pwd) $(pwd)/build windows: - name: AMD64 ${{ matrix.name }} C++17 + name: ${{ matrix.title }} runs-on: ${{ matrix.os }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 @@ -230,7 +237,8 @@ jobs: - windows-2019 include: - os: windows-2019 - name: Windows 2019 + simd-level: AVX2 + title: AMD64 Windows 2019 C++17 AVX2 env: ARROW_BOOST_USE_SHARED: OFF ARROW_BUILD_BENCHMARKS: ON @@ -245,6 +253,7 @@ jobs: ARROW_MIMALLOC: ON ARROW_ORC: ON ARROW_PARQUET: ON + ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF ARROW_WITH_BROTLI: OFF @@ -260,8 +269,6 @@ jobs: CMAKE_INSTALL_LIBDIR: bin CMAKE_INSTALL_PREFIX: /usr CMAKE_UNITY_BUILD: ON - OPENSSL_ROOT_DIR: >- - C:\Program Files\OpenSSL-Win64 NPROC: 3 steps: - name: Disable Crash Dialogs @@ -364,6 +371,7 @@ jobs: CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower}} -DBoost_NO_BOOST_CMAKE=ON + -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON # We can't use unity build because we don't have enough memory on # GitHub Actions. 
# CMAKE_UNITY_BUILD: ON diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 7c2437f6edfb5..119d11d9a399a 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -103,7 +103,7 @@ jobs: shell: bash run: | gem install test-unit - pip install cython setuptools six pytest jira + pip install "cython<3" setuptools six pytest jira - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index a9a13e82a9dd0..c31d1f309752e 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -45,6 +45,9 @@ "Component: Ruby": - ruby/**/* +"Component: Swift": + - swift/**/* + "Component: FlightRPC": - cpp/src/arrow/flight/**/* - r/R/flight.* diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml index ffd56e440f430..ae344a4c1eba9 100644 --- a/.github/workflows/issue_bot.yml +++ b/.github/workflows/issue_bot.yml @@ -21,6 +21,7 @@ on: issues: types: - opened + - edited permissions: contents: read @@ -38,6 +39,20 @@ jobs: let split_body = context.payload.issue.body.split('### Component(s)'); if (split_body.length != 2) throw new Error('No components found!'); + let current_labels = await github.rest.issues.listLabelsOnIssue({ + "owner": context.repo.owner, + "repo": context.repo.repo, + "per_page": 100, + "issue_number": context.payload.issue.number, + }); + + let current_label_names = current_labels.data.map(label => label.name); + + // keep non-component labels + let non_component_labels = current_label_names.filter( + label => !label.startsWith("Component: ") + ); + let component_labels = split_body[1] .split(',') .map(component => component.trim()) @@ -56,9 +71,9 @@ jobs: if (component_labels.length == 0) throw new Error('No components found!'); - await github.rest.issues.addLabels({ + await github.rest.issues.setLabels({ "owner": context.repo.owner, "repo": context.repo.repo, "issue_number": context.payload.issue.number, - "labels": component_labels, - }); \ No newline at end of file + "labels": component_labels.concat(non_component_labels), + }); diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 60e880101f3bf..7e8ef31b49cb6 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -80,7 +80,7 @@ jobs: name: AMD64 macOS 11 NodeJS ${{ matrix.node }} runs-on: macos-latest if: github.event_name == 'push' - timeout-minutes: 60 + timeout-minutes: 90 strategy: fail-fast: false matrix: @@ -90,6 +90,12 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Jest Cache + uses: actions/cache@v3 + with: + path: js/.jest-cache + key: js-jest-cache-${{ runner.os }}-${{ hashFiles('js/src/**/*.ts', 'js/test/**/*.ts', 'js/yarn.lock') }} + restore-keys: js-jest-cache-${{ runner.os }}- - name: Install NodeJS uses: actions/setup-node@v3 with: @@ -114,6 +120,12 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Jest Cache + uses: actions/cache@v3 + with: + path: js/.jest-cache + key: js-jest-cache-${{ runner.os }}-${{ hashFiles('js/src/**/*.ts', 'js/test/**/*.ts', 'js/yarn.lock') }} + restore-keys: js-jest-cache-${{ runner.os }}- - name: Install NodeJS uses: actions/setup-node@v3 with: diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index c35f01be3f569..a05f4d5a11d68 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -229,6 +229,7 @@ jobs: CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/ucrt${{ matrix.mingw-n-bits }} 
-DBoost_NO_BOOST_CMAKE=ON + -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON CMAKE_UNITY_BUILD: ON steps: - name: Disable Crash Dialogs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e95778ce1cbfd..d3c7624f63e71 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,7 @@ repos: entry: --entrypoint /bin/hadolint hadolint/hadolint:latest - exclude: ^dev/.*$ - repo: https://github.com/pycqa/flake8 - rev: 5.0.3 + rev: 6.1.0 hooks: - id: flake8 name: Python Format diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2c5f2965107b8..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,166 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -dist: focal - -language: minimal - -cache: - directories: - - $TRAVIS_BUILD_DIR/.docker - -addons: - apt: - packages: - - python3-pip - -services: - - docker - -# Note that the global "env" setting isn't inherited automatically by -# matrix entries with their own "env", so we have to insert it explicitly. -env: &global_env - ARROW_ENABLE_TIMING_TESTS: "OFF" - COMPOSE_DOCKER_CLI_BUILD: 1 - DOCKER_BUILDKIT: 0 - DOCKER_VOLUME_PREFIX: $TRAVIS_BUILD_DIR/.docker/ - -jobs: - include: - - name: "C++ on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "CPP" - DOCKER_IMAGE_ID: ubuntu-cpp - # Can't enable ARROW_MIMALLOC because of failures in memory pool tests. - # Can't enable ARROW_S3 because compiler is killed while compiling - # aws-sdk-cpp. - DOCKER_RUN_ARGS: >- - " - -e ARROW_FLIGHT=ON - -e ARROW_GCS=OFF - -e ARROW_MIMALLOC=OFF - -e ARROW_ORC=OFF - -e ARROW_PARQUET=OFF - -e ARROW_S3=OFF - -e ARROW_SUBSTRAIT=OFF - -e CMAKE_BUILD_PARALLEL_LEVEL=2 - -e CMAKE_UNITY_BUILD=ON - -e PARQUET_BUILD_EXAMPLES=OFF - -e PARQUET_BUILD_EXECUTABLES=OFF - -e Protobuf_SOURCE=BUNDLED - -e gRPC_SOURCE=BUNDLED - " - # The LLVM's APT repository causes download error for s390x binary - # We should use the LLVM provided by the default APT repository - CLANG_TOOLS: "10" - LLVM: "10" - UBUNTU: "20.04" - - - name: "Go on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "GO" - DOCKER_IMAGE_ID: debian-go - - - name: "Java on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "JAVA" - DOCKER_IMAGE_ID: debian-java - JDK: 11 - - - name: "Python on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "PYTHON" - DOCKER_IMAGE_ID: ubuntu-python - # Can't enable ARROW_MIMALLOC because of failures in memory pool tests. - # Can't enable ARROW_S3 because compiler is killed while compiling - # aws-sdk-cpp. 
- DOCKER_RUN_ARGS: >- - " - -e ARROW_FLIGHT=ON - -e ARROW_GCS=OFF - -e ARROW_MIMALLOC=OFF - -e ARROW_ORC=OFF - -e ARROW_PARQUET=OFF - -e ARROW_PYTHON=ON - -e ARROW_S3=OFF - -e CMAKE_BUILD_PARALLEL_LEVEL=2 - -e CMAKE_UNITY_BUILD=ON - -e PARQUET_BUILD_EXAMPLES=OFF - -e PARQUET_BUILD_EXECUTABLES=OFF - -e Protobuf_SOURCE=BUNDLED - -e gRPC_SOURCE=BUNDLED - " - # The LLVM's APT repository causes download error for s390x binary - # We should use the LLVM provided by the default APT repository - CLANG_TOOLS: "10" - LLVM: "10" - UBUNTU: "20.04" - - allow_failures: - - name: "Java on s390x" - - name: "C++ on s390x" - - name: "Python on s390x" - -before_install: - - eval "$(python ci/detect-changes.py)" - - | - arrow_ci_affected=no - for arrow_ci_module in ${ARROW_CI_MODULES}; do - arrow_ci_affected_variable=ARROW_CI_${arrow_ci_module}_AFFECTED - if [ "$(eval "echo \$${arrow_ci_affected_variable}")" = "1" ]; then - arrow_ci_affected=yes - fi - done - if [ "${arrow_ci_affected}" = "no" ]; then - travis_terminate 0 - fi - -install: - - sudo -H pip3 install --upgrade pip - - sudo -H pip3 install 'docker-compose>=1.27.0' - - sudo -H pip3 install -e dev/archery[docker] - -script: - - export ARCHERY_DEFAULT_BRANCH=$(git rev-parse --abbrev-ref origin/HEAD | sed s@origin/@@) - - | - archery docker run \ - ${DOCKER_RUN_ARGS} \ - --volume ${PWD}/build:/build \ - ${DOCKER_IMAGE_ID} - -after_success: - - | - if [ "${TRAVIS_EVENT_TYPE}" = "push" -a \ - "${TRAVIS_REPO_SLUG}" = "apache/arrow" ]; then - archery docker push ${DOCKER_IMAGE_ID} || : - fi diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 04f985c94bb2c..4ae5c3614a1dc 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -18,7 +18,7 @@ # don't add pandas here, because it is not a mandatory test dependency boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture cffi -cython +cython<3 cloudpickle fsspec hypothesis diff --git a/ci/detect-changes.py b/ci/detect-changes.py deleted file mode 100644 index 7669639ecd3a9..0000000000000 --- a/ci/detect-changes.py +++ /dev/null @@ -1,362 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from __future__ import print_function - -import functools -import os -import pprint -import re -import sys -import subprocess - - -perr = functools.partial(print, file=sys.stderr) - - -def dump_env_vars(prefix, pattern=None): - if pattern is not None: - match = lambda s: re.search(pattern, s) - else: - match = lambda s: True - for name in sorted(os.environ): - if name.startswith(prefix) and match(name): - perr("- {0}: {1!r}".format(name, os.environ[name])) - - -def run_cmd(cmdline): - proc = subprocess.Popen(cmdline, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = proc.communicate() - if proc.returncode != 0: - raise RuntimeError("Command {cmdline} failed with code {returncode}, " - "stderr was:\n{stderr}\n" - .format(cmdline=cmdline, returncode=proc.returncode, - stderr=err.decode())) - return out - - -def get_commit_description(commit): - """ - Return the textual description (title + body) of the given git commit. - """ - out = run_cmd(["git", "show", "--no-patch", "--pretty=format:%B", - commit]) - return out.decode('utf-8', 'ignore') - - -def list_affected_files(commit_range): - """ - Return a list of files changed by the given git commit range. - """ - perr("Getting affected files from", repr(commit_range)) - out = run_cmd(["git", "diff", "--name-only", commit_range]) - return list(filter(None, (s.strip() for s in out.decode().splitlines()))) - - -def get_travis_head_commit(): - return os.environ['TRAVIS_COMMIT'] - - -def get_travis_commit_range(): - if os.environ['TRAVIS_EVENT_TYPE'] == 'pull_request': - # TRAVIS_COMMIT_RANGE is too pessimistic for PRs, as it may contain - # unrelated changes. Instead, use the same strategy as on AppVeyor - # below. - run_cmd(["git", "fetch", "-q", "origin", - "+refs/heads/{0}".format(os.environ['TRAVIS_BRANCH'])]) - merge_base = run_cmd(["git", "merge-base", - "HEAD", "FETCH_HEAD"]).decode().strip() - return "{0}..HEAD".format(merge_base) - else: - cr = os.environ['TRAVIS_COMMIT_RANGE'] - # See - # https://github.com/travis-ci/travis-ci/issues/4596#issuecomment-139811122 - return cr.replace('...', '..') - - -def get_travis_commit_description(): - # Prefer this to get_commit_description(get_travis_head_commit()), - # as rebasing or other repository events may make TRAVIS_COMMIT invalid - # at the time we inspect it - return os.environ['TRAVIS_COMMIT_MESSAGE'] - - -def list_travis_affected_files(): - """ - Return a list of files affected in the current Travis build. - """ - commit_range = get_travis_commit_range() - try: - return list_affected_files(commit_range) - except RuntimeError: - # TRAVIS_COMMIT_RANGE can contain invalid revisions when - # building a branch (not a PR) after rebasing: - # https://github.com/travis-ci/travis-ci/issues/2668 - if os.environ['TRAVIS_EVENT_TYPE'] == 'pull_request': - raise - # If it's a rebase, it's probably enough to use the last commit only - commit_range = '{0}^..'.format(get_travis_head_commit()) - return list_affected_files(commit_range) - - -def list_appveyor_affected_files(): - """ - Return a list of files affected in the current AppVeyor build. - This only works for PR builds. - """ - # Re-fetch PR base branch (e.g. 
origin/master), pointing FETCH_HEAD to it - run_cmd(["git", "fetch", "-q", "origin", - "+refs/heads/{0}".format(os.environ['APPVEYOR_REPO_BRANCH'])]) - # Compute base changeset between FETCH_HEAD (PR base) and HEAD (PR head) - merge_base = run_cmd(["git", "merge-base", - "HEAD", "FETCH_HEAD"]).decode().strip() - # Compute changes files between base changeset and HEAD - return list_affected_files("{0}..HEAD".format(merge_base)) - - -def list_github_actions_affected_files(): - """ - Return a list of files affected in the current GitHub Actions build. - """ - # GitHub Actions checkout `refs/remotes/pull/$PR/merge` where `HEAD` points - # to the merge commit while `HEAD^` points to the commit before. Hence, - # `..HEAD^` points to all commit between the default branch and the PR. - return list_affected_files("HEAD^..") - - -LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', - 'r', 'ruby', 'csharp'] - -ALL_TOPICS = LANGUAGE_TOPICS + ['integration', 'dev'] - - -AFFECTED_DEPENDENCIES = { - 'java': ['integration', 'python'], - 'js': ['integration'], - 'ci': ALL_TOPICS, - 'cpp': ['python', 'c_glib', 'r', 'ruby', 'integration'], - 'format': LANGUAGE_TOPICS, - 'go': ['integration'], - '.travis.yml': ALL_TOPICS, - 'appveyor.yml': ALL_TOPICS, - # In theory, it should ignore CONTRIBUTING.md and ISSUE_TEMPLATE.md, but in - # practice it's going to be CI - '.github': ALL_TOPICS, - 'c_glib': ['ruby'] -} - -COMPONENTS = {'cpp', 'java', 'c_glib', 'r', 'ruby', 'integration', 'js', - 'csharp', 'go', 'docs', 'python', 'dev'} - - -def get_affected_topics(affected_files): - """ - Return a dict of topics affected by the given files. - Each dict value is True if affected, False otherwise. - """ - affected = dict.fromkeys(ALL_TOPICS, False) - - for path in affected_files: - parts = [] - head = path - while head: - head, tail = os.path.split(head) - parts.append(tail) - parts.reverse() - assert parts - p = parts[0] - fn = parts[-1] - if fn.startswith('README'): - continue - - if p in COMPONENTS: - affected[p] = True - - _path_already_affected = {} - - def _affect_dependencies(component): - if component in _path_already_affected: - # For circular dependencies, terminate - return - for topic in AFFECTED_DEPENDENCIES.get(component, ()): - affected[topic] = True - _affect_dependencies(topic) - _path_already_affected[topic] = True - - _affect_dependencies(p) - - return affected - - -def make_env_for_topics(affected): - return {'ARROW_CI_{0}_AFFECTED'.format(k.upper()): '1' if v else '0' - for k, v in affected.items()} - - -def get_unix_shell_eval(env): - """ - Return a shell-evalable string to setup some environment variables. - """ - return "; ".join(("export {0}='{1}'".format(k, v) - for k, v in env.items())) - - -def get_windows_shell_eval(env): - """ - Return a shell-evalable string to setup some environment variables. 
- """ - return "\n".join(('set "{0}={1}"'.format(k, v) - for k, v in env.items())) - - -def run_from_travis(): - perr("Environment variables (excerpt):") - dump_env_vars('TRAVIS_', '(BRANCH|COMMIT|PULL)') - if (os.environ['TRAVIS_REPO_SLUG'] == 'apache/arrow' and - os.environ['TRAVIS_BRANCH'] in ['master', 'main'] and - os.environ['TRAVIS_EVENT_TYPE'] != 'pull_request'): - # Never skip anything on default-branch builds in the official repo - affected = dict.fromkeys(ALL_TOPICS, True) - else: - desc = get_travis_commit_description() - if '[skip travis]' in desc: - # Skip everything - affected = dict.fromkeys(ALL_TOPICS, False) - elif '[force ci]' in desc or '[force travis]' in desc: - # Test everything - affected = dict.fromkeys(ALL_TOPICS, True) - else: - # Test affected topics - affected_files = list_travis_affected_files() - perr("Affected files:", affected_files) - affected = get_affected_topics(affected_files) - assert set(affected) <= set(ALL_TOPICS), affected - - perr("Affected topics:") - perr(pprint.pformat(affected)) - return get_unix_shell_eval(make_env_for_topics(affected)) - - -def run_from_appveyor(): - perr("Environment variables (excerpt):") - dump_env_vars('APPVEYOR_', '(PULL|REPO)') - if not os.environ.get('APPVEYOR_PULL_REQUEST_HEAD_COMMIT'): - # Not a PR build, test everything - affected = dict.fromkeys(ALL_TOPICS, True) - else: - affected_files = list_appveyor_affected_files() - perr("Affected files:", affected_files) - affected = get_affected_topics(affected_files) - assert set(affected) <= set(ALL_TOPICS), affected - - perr("Affected topics:") - perr(pprint.pformat(affected)) - return get_windows_shell_eval(make_env_for_topics(affected)) - - -def run_from_github(): - perr("Environment variables (excerpt):") - dump_env_vars('GITHUB_', '(REPOSITORY|ACTOR|SHA|REF|HEAD_REF|BASE_REF|EVENT_NAME)') - if os.environ['GITHUB_EVENT_NAME'] != 'pull_request': - # Not a PR build, test everything - affected = dict.fromkeys(ALL_TOPICS, True) - else: - affected_files = list_github_actions_affected_files() - perr("Affected files:", affected_files) - affected = get_affected_topics(affected_files) - assert set(affected) <= set(ALL_TOPICS), affected - - perr("Affected topics:") - perr(pprint.pformat(affected)) - return get_unix_shell_eval(make_env_for_topics(affected)) - - -def test_get_affected_topics(): - affected_topics = get_affected_topics(['cpp/CMakeLists.txt']) - assert affected_topics == { - 'c_glib': True, - 'cpp': True, - 'docs': False, - 'go': False, - 'java': False, - 'js': False, - 'python': True, - 'r': True, - 'ruby': True, - 'csharp': False, - 'integration': True, - 'dev': False - } - - affected_topics = get_affected_topics(['format/Schema.fbs']) - assert affected_topics == { - 'c_glib': True, - 'cpp': True, - 'docs': True, - 'go': True, - 'java': True, - 'js': True, - 'python': True, - 'r': True, - 'ruby': True, - 'csharp': True, - 'integration': True, - 'dev': False - } - - affected_topics = get_affected_topics(['.github/workflows']) - assert affected_topics == { - 'c_glib': True, - 'cpp': True, - 'docs': True, - 'go': True, - 'java': True, - 'js': True, - 'python': True, - 'r': True, - 'ruby': True, - 'csharp': True, - 'integration': True, - 'dev': True, - } - - -if __name__ == "__main__": - # This script should have its output evaluated by a shell, - # e.g. 
"eval `python ci/detect-changes.py`" - if os.environ.get('TRAVIS'): - try: - print(run_from_travis()) - except Exception: - # Make sure the enclosing eval will return an error - print("exit 1") - raise - elif os.environ.get('APPVEYOR'): - try: - print(run_from_appveyor()) - except Exception: - print("exit 1") - raise - elif os.environ.get('GITHUB_WORKFLOW'): - try: - print(run_from_github()) - except Exception: - print("exit 1") - raise - else: - sys.exit("Script must be run under Travis-CI, AppVeyor or GitHub Actions") diff --git a/ci/docker/java-jni-manylinux-201x.dockerfile b/ci/docker/java-jni-manylinux-201x.dockerfile index 6374e40641ce2..e9888db7c5332 100644 --- a/ci/docker/java-jni-manylinux-201x.dockerfile +++ b/ci/docker/java-jni-manylinux-201x.dockerfile @@ -34,8 +34,12 @@ RUN vcpkg install \ # Install Java ARG java=1.8.0 -RUN yum install -y java-$java-openjdk-devel rh-maven35 && yum clean all -ENV JAVA_HOME=/usr/lib/jvm/java-$java-openjdk/ +ARG maven=3.9.3 +RUN yum install -y java-$java-openjdk-devel && \ + yum clean all && \ + curl https://dlcdn.apache.org/maven/maven-3/${maven}/binaries/apache-maven-${maven}-bin.tar.gz | \ + tar xfz - -C /usr/local && \ + ln -s /usr/local/apache-maven-${maven}/bin/mvn /usr/local/bin # Install the gcs testbench COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 19f30717ca2e2..c59766c4a665c 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -47,10 +47,6 @@ RUN apt-get update -y && \ libxml2-dev \ libgit2-dev \ libssl-dev \ - # install clang to mirror what was done on Travis - clang \ - clang-format \ - clang-tidy \ # R CMD CHECK --as-cran needs pdflatex to build the package manual texlive-latex-base \ # Need locales so we can set UTF-8 diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index a5c1f0cdc1822..f94494177e8ee 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -141,6 +141,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin # static Arrow to run Flight/Flight SQL tests ENV absl_SOURCE=BUNDLED \ ARROW_ACERO=ON \ + ARROW_AZURE=ON \ ARROW_BUILD_STATIC=ON \ ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 764e9fd4f9ded..e773c6f1ee659 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -164,6 +164,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin # - libgtest-dev only provide sources ENV absl_SOURCE=BUNDLED \ ARROW_ACERO=ON \ + ARROW_AZURE=ON \ ARROW_BUILD_STATIC=ON \ ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 9854c5ff162af..e53b3fa460915 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -83,7 +83,8 @@ pushd ${build_dir} cmake \ -Dabsl_SOURCE=${absl_SOURCE:-} \ - -DARROW_ACERO=${ARROW_ACERO:-ON} \ + -DARROW_ACERO=${ARROW_ACERO:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ @@ -109,7 +110,6 @@ cmake \ -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} 
\ - -DARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS=${ARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS:-OFF} \ -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ @@ -126,6 +126,7 @@ cmake \ -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-ON} \ -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ diff --git a/ci/scripts/integration_dask.sh b/ci/scripts/integration_dask.sh index eeaba715b6ae7..d1e2ecdc847f2 100755 --- a/ci/scripts/integration_dask.sh +++ b/ci/scripts/integration_dask.sh @@ -33,6 +33,9 @@ python -c "import dask.dataframe" pytest -v --pyargs dask.dataframe.tests.test_dataframe pytest -v --pyargs dask.dataframe.io.tests.test_orc -pytest -v --pyargs dask.dataframe.io.tests.test_parquet +# skip failing parquet tests +# test_pandas_timestamp_overflow_pyarrow is skipped because of GH-33321. +pytest -v --pyargs dask.dataframe.io.tests.test_parquet \ + -k "not test_pandas_timestamp_overflow_pyarrow" # this file contains parquet tests that use S3 filesystem pytest -v --pyargs dask.bytes.tests.test_s3 diff --git a/ci/scripts/java_jni_macos_build.sh b/ci/scripts/java_jni_macos_build.sh index 2b2384ab7f0f7..4a6f9444ec25f 100755 --- a/ci/scripts/java_jni_macos_build.sh +++ b/ci/scripts/java_jni_macos_build.sh @@ -99,6 +99,7 @@ if [ "${ARROW_BUILD_TESTS}" == "ON" ]; then # MinIO is required exclude_tests="arrow-s3fs-test" # unstable + exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" ctest \ --exclude-regex "${exclude_tests}" \ diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh index 4e1192a4dbad2..03939715e390f 100755 --- a/ci/scripts/java_jni_manylinux_build.sh +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -21,8 +21,14 @@ set -ex arrow_dir=${1} build_dir=${2} +normalized_arch=$(arch) +case ${normalized_arch} in + aarch64) + normalized_arch=aarch_64 + ;; +esac # The directory where the final binaries will be stored when scripts finish -dist_dir=${3}/$(arch) +dist_dir=${3}/${normalized_arch} echo "=== Clear output directories and leftovers ===" # Clear output directories and leftovers @@ -103,9 +109,18 @@ ninja install if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then # MinIO is required exclude_tests="arrow-s3fs-test" + case $(arch) in + aarch64) + # The GCS testbench crashes on aarch64: + # ImportError: ../grpc/_cython/cygrpc.cpython-38-aarch64-linux-gnu.so: + # undefined symbol: vtable for std::__cxx11::basic_ostringstream< + # char, std::char_traits<char>, std::allocator<char> > + exclude_tests="${exclude_tests}|arrow-gcsfs-test" + ;; + esac # unstable - exclude_tests="${exclude_tests}|arrow-compute-hash-join-node-test" - exclude_tests="${exclude_tests}|arrow-dataset-scanner-test" + exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" + exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" # strptime exclude_tests="${exclude_tests}|arrow-utility-test" ctest \ --exclude-regex "${exclude_tests}" \ @@ -138,6 +153,7 @@ fi echo "=== Checking shared dependencies for libraries ===" pushd ${dist_dir} archery linking check-dependencies \ + --allow ld-linux-aarch64 \ --allow ld-linux-x86-64 \ --allow libc \ --allow libdl \ diff --git a/ci/scripts/swift_test.sh b/ci/scripts/swift_test.sh index
eac13c5d68ef4..b7ab37fd489c9 100755 --- a/ci/scripts/swift_test.sh +++ b/ci/scripts/swift_test.sh @@ -30,3 +30,8 @@ source_dir=${1}/swift/Arrow pushd ${source_dir} swift test popd + +source_dir=${1}/swift/ArrowFlight +pushd ${source_dir} +swift test +popd diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 7882be57a0534..94141d693be8f 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -97,6 +97,7 @@ "inherits": "features-basic", "hidden": true, "cacheVariables": { + "ARROW_AZURE": "ON", "ARROW_GCS": "ON", "ARROW_HDFS": "ON", "ARROW_S3": "ON" diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 8601184309f34..f32bb2bcf7290 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -31,24 +31,6 @@ function(check_description_length name description) endforeach() endfunction() -function(list_join lst glue out) - if("${${lst}}" STREQUAL "") - set(${out} - "" - PARENT_SCOPE) - return() - endif() - - list(GET ${lst} 0 joined) - list(REMOVE_AT ${lst} 0) - foreach(item ${${lst}}) - set(joined "${joined}${glue}${item}") - endforeach() - set(${out} - ${joined} - PARENT_SCOPE) -endfunction() - macro(define_option name description default) set(options) set(one_value_args) @@ -63,7 +45,7 @@ macro(define_option name description default) endif() check_description_length(${name} ${description}) - list_join(description "\n" multiline_description) + list(JOIN description "\n" multiline_description) option(${name} "${multiline_description}" ${default}) @@ -76,7 +58,7 @@ endmacro() macro(define_option_string name description default) check_description_length(${name} ${description}) - list_join(description "\n" multiline_description) + list(JOIN description "\n" multiline_description) set(${name} ${default} @@ -87,8 +69,12 @@ macro(define_option_string name description default) set("${name}_OPTION_DEFAULT" "\"${default}\"") set("${name}_OPTION_TYPE" "string") set("${name}_OPTION_POSSIBLE_VALUES" ${ARGN}) - - list_join("${name}_OPTION_POSSIBLE_VALUES" "|" "${name}_OPTION_ENUM") + list(FIND ${name}_OPTION_POSSIBLE_VALUES "${default}" default_value_index) + if(NOT ${default_value_index} EQUAL -1) + list(REMOVE_AT ${name}_OPTION_POSSIBLE_VALUES ${default_value_index}) + list(PREPEND ${name}_OPTION_POSSIBLE_VALUES "${default}") + endif() + list(JOIN "${name}_OPTION_POSSIBLE_VALUES" "|" "${name}_OPTION_ENUM") if(NOT ("${${name}_OPTION_ENUM}" STREQUAL "")) set_property(CACHE ${name} PROPERTY STRINGS "${name}_OPTION_POSSIBLE_VALUES") endif() @@ -301,6 +287,9 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_COMPUTE ARROW_IPC) + define_option(ARROW_AZURE + "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) define_option(ARROW_COMPUTE "Build all Arrow Compute kernels" OFF) diff --git a/cpp/cmake_modules/FindProtobufAlt.cmake b/cpp/cmake_modules/FindProtobufAlt.cmake index d29f757aeb659..15fe1b4f27ef7 100644 --- a/cpp/cmake_modules/FindProtobufAlt.cmake +++ b/cpp/cmake_modules/FindProtobufAlt.cmake @@ -30,3 +30,10 @@ if(ProtobufAlt_FIND_QUIETLY) endif() find_package(Protobuf ${find_package_args}) set(ProtobufAlt_FOUND ${Protobuf_FOUND}) +if(ProtobufAlt_FOUND) + set(ProtobufAlt_VERSION ${Protobuf_VERSION}) + set(ProtobufAlt_VERSION_MAJOR ${Protobuf_VERSION_MAJOR}) + set(ProtobufAlt_VERSION_MINOR ${Protobuf_VERSION_MINOR}) + set(ProtobufAlt_VERSION_PATCH ${Protobuf_VERSION_PATCH}) + 
set(ProtobufAlt_VERSION_TWEEK ${Protobuf_VERSION_TWEEK}) +endif() diff --git a/cpp/cmake_modules/FindgRPCAlt.cmake b/cpp/cmake_modules/FindgRPCAlt.cmake index 4e38605235b11..2ff10dbc23dd2 100644 --- a/cpp/cmake_modules/FindgRPCAlt.cmake +++ b/cpp/cmake_modules/FindgRPCAlt.cmake @@ -33,6 +33,8 @@ pkg_check_modules(GRPCPP_PC grpc++) if(GRPCPP_PC_FOUND) set(gRPCAlt_VERSION "${GRPCPP_PC_VERSION}") set(GRPCPP_INCLUDE_DIRECTORIES ${GRPCPP_PC_INCLUDEDIR}) + # gRPC's pkg-config file neglects to specify pthreads. + find_package(Threads REQUIRED) if(ARROW_GRPC_USE_SHARED) set(GRPCPP_LINK_LIBRARIES ${GRPCPP_PC_LINK_LIBRARIES}) set(GRPCPP_LINK_OPTIONS ${GRPCPP_PC_LDFLAGS_OTHER}) @@ -48,19 +50,22 @@ if(GRPCPP_PC_FOUND) set(GRPCPP_LINK_OPTIONS ${GRPCPP_PC_STATIC_LDFLAGS_OTHER}) set(GRPCPP_COMPILE_OPTIONS ${GRPCPP_PC_STATIC_CFLAGS_OTHER}) endif() + list(APPEND GRPCPP_LINK_LIBRARIES Threads::Threads) list(GET GRPCPP_LINK_LIBRARIES 0 GRPCPP_IMPORTED_LOCATION) list(REMOVE_AT GRPCPP_LINK_LIBRARIES 0) find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin HINTS ${GRPCPP_PC_PREFIX} NO_DEFAULT_PATH PATH_SUFFIXES "bin") - set(gRPCAlt_FIND_PACKAGE_ARGS gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION - GRPC_CPP_PLUGIN) - if(gRPCAlt_VERSION) - list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) - endif() - find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) +endif() +set(gRPCAlt_FIND_PACKAGE_ARGS gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION + GRPC_CPP_PLUGIN) +if(gRPCAlt_VERSION) + list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) +endif() +find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) +if(gRPCAlt_FOUND) # gRPC does not expose the reflection library via pkg-config, but it should be alongside the main library get_filename_component(GRPCPP_IMPORTED_DIRECTORY ${GRPCPP_IMPORTED_LOCATION} DIRECTORY) if(ARROW_GRPC_USE_SHARED) @@ -74,11 +79,7 @@ if(GRPCPP_PC_FOUND) NAMES grpc++_reflection ${GRPCPP_REFLECTION_LIB_NAME} PATHS ${GRPCPP_IMPORTED_DIRECTORY} NO_DEFAULT_PATH) -else() - set(gRPCAlt_FOUND FALSE) -endif() -if(gRPCAlt_FOUND) add_library(gRPC::grpc++ UNKNOWN IMPORTED) set_target_properties(gRPC::grpc++ PROPERTIES IMPORTED_LOCATION "${GRPCPP_IMPORTED_LOCATION}" diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 076c2e7450798..6b47fcb717287 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -62,29 +62,32 @@ if(ARROW_CPU_FLAG STREQUAL "x86") "${ARROW_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() - check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) - if(MINGW) - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 - message(STATUS "Disable AVX512 support on MINGW for now") - else() - # Check for AVX512 support in the compiler. 
- set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}") - check_cxx_source_compiles(" - #ifdef _MSC_VER - #include <intrin.h> - #else - #include <immintrin.h> - #endif - - int main() { - __m512i mask = _mm512_set1_epi32(0x1); - char out[32]; - _mm512_storeu_si512(out, mask); - return 0; - }" - CXX_SUPPORTS_AVX512) - set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + # Check for AVX extensions on 64-bit systems only, as 32-bit support seems iffy + check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) + if(MINGW) + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 + message(STATUS "Disable AVX512 support on MINGW for now") + else() + # Check for AVX512 support in the compiler. + set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}") + check_cxx_source_compiles(" + #ifdef _MSC_VER + #include <intrin.h> + #else + #include <immintrin.h> + #endif + + int main() { + __m512i mask = _mm512_set1_epi32(0x1); + char out[32]; + _mm512_storeu_si512(out, mask); + return 0; + }" + CXX_SUPPORTS_AVX512) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) + endif() endif() # Runtime SIMD level it can get from compiler and ARROW_RUNTIME_SIMD_LEVEL if(CXX_SUPPORTS_SSE4_2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4a19e226f7b56..1e7840cf92e08 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -138,6 +138,9 @@ if(ARROW_PACKAGE_PREFIX) if(NOT ENV{Boost_ROOT}) set(ENV{Boost_ROOT} ${ARROW_PACKAGE_PREFIX}) endif() + if(NOT DEFINED OPENSSL_ROOT_DIR) + set(OPENSSL_ROOT_DIR ${ARROW_PACKAGE_PREFIX}) + endif() endif() # For each dependency, set dependency source to global default, if unset @@ -913,6 +916,7 @@ set(EP_COMMON_CMAKE_ARGS -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=${CMAKE_EXPORT_NO_PACKAGE_REGISTRY} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=${CMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY} -DCMAKE_INSTALL_LIBDIR=lib + -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE}) # Enable s/ccache if set by parent. @@ -1224,7 +1228,7 @@ if(ARROW_USE_BOOST) target_compile_definitions(Boost::headers INTERFACE "BOOST_USE_WINDOWS_H=1") endif() - message(STATUS "Boost include dir: ${Boost_INCLUDE_DIR}") + message(STATUS "Boost include dir: ${Boost_INCLUDE_DIRS}") endif() # ---------------------------------------------------------------------- @@ -1364,8 +1368,9 @@ set(ARROW_OPENSSL_REQUIRED_VERSION "1.0.2") set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT - OR ARROW_S3 - OR ARROW_GANDIVA) + OR ARROW_GANDIVA + OR ARROW_GCS + OR ARROW_S3) set(OpenSSL_SOURCE "SYSTEM") resolve_dependency(OpenSSL HAVE_ALT @@ -1710,7 +1715,17 @@ if(ARROW_WITH_PROTOBUF) else() set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") endif() + # We need FORCE_ANY_NEWER_VERSION here to accept newer Protobuf + # versions such as 23.4. Without it, 23.4 is treated as incompatible + # with 3.12.0 by the protobuf-config.cmake provided by Protobuf, + # because protobuf-config-version.cmake requires the same major + # version: "23" for 23.4 and "3" for 3.12.0 differ, so 23.4 is + # rejected against 3.12.0. With FORCE_ANY_NEWER_VERSION we bypass + # that check and can use 23.4.
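For context on the dual numbering this comment wrestles with: since Protobuf's 22.x line, the CMake package version (e.g. "23.4") and the C++ runtime version (e.g. "4.23.4") use different majors, so a build that requests "3.12.0" fails an exact-major comparison either way. A minimal probe, assuming GOOGLE_PROTOBUF_VERSION is still defined by <google/protobuf/stubs/common.h> in this range (an assumption, not part of this diff):

    // Hypothetical probe; GOOGLE_PROTOBUF_VERSION encodes major*1000000 +
    // minor*1000 + patch, so Protobuf "23.4" (C++ runtime 4.23.4) prints 4023004.
    #include <google/protobuf/stubs/common.h>
    #include <iostream>

    int main() {
      std::cout << GOOGLE_PROTOBUF_VERSION << std::endl;
      return 0;
    }

The resolve_dependency(Protobuf ...) call that follows passes FORCE_ANY_NEWER_VERSION TRUE to bypass exactly that major-version equality check.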
resolve_dependency(Protobuf + FORCE_ANY_NEWER_VERSION + TRUE HAVE_ALT TRUE REQUIRED_VERSION @@ -1853,7 +1868,7 @@ macro(build_substrait) add_library(substrait STATIC ${SUBSTRAIT_SOURCES}) set_target_properties(substrait PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(substrait PUBLIC ${SUBSTRAIT_INCLUDES}) - target_link_libraries(substrait INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) + target_link_libraries(substrait PUBLIC ${ARROW_PROTOBUF_LIBPROTOBUF}) add_dependencies(substrait substrait_gen) list(APPEND ARROW_BUNDLED_STATIC_LIBS substrait) @@ -3971,7 +3986,7 @@ macro(build_grpc) endmacro() if(ARROW_WITH_GRPC) - set(ARROW_GRPC_REQUIRED_VERSION "1.17.0") + set(ARROW_GRPC_REQUIRED_VERSION "1.30.0") if(NOT Protobuf_SOURCE STREQUAL gRPC_SOURCE) # ARROW-15495: Protobuf/gRPC must come from the same source message(STATUS "Forcing gRPC_SOURCE to Protobuf_SOURCE (${Protobuf_SOURCE})") @@ -3986,21 +4001,15 @@ if(ARROW_WITH_GRPC) grpc++) if(GRPC_VENDORED) - set(GRPCPP_PP_INCLUDE TRUE) + # Remove "v" from "vX.Y.Z" + string(SUBSTRING ${ARROW_GRPC_BUILD_VERSION} 1 -1 ARROW_GRPC_VERSION) # Examples need to link to static Arrow if we're using static gRPC set(ARROW_GRPC_USE_SHARED OFF) else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. - get_target_property(GRPC_INCLUDE_DIR gRPC::grpc++ INTERFACE_INCLUDE_DIRECTORIES) - if(GRPC_INCLUDE_DIR MATCHES "^\\$<" - OR # generator expression - EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") - set(GRPCPP_PP_INCLUDE TRUE) - elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") - set(GRPCPP_PP_INCLUDE FALSE) + if(gRPCAlt_VERSION) + set(ARROW_GRPC_VERSION ${gRPCAlt_VERSION}) else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") + set(ARROW_GRPC_VERSION ${gRPC_VERSION}) endif() if(ARROW_USE_ASAN) # Disable ASAN in system gRPC. @@ -4098,10 +4107,6 @@ macro(build_google_cloud_cpp_storage) # Curl is required on all platforms, but building it internally might also trip over S3's copy. # For now, force its inclusion from the underlying system or fail. 
find_curl() - if(NOT OpenSSL_FOUND) - resolve_dependency(OpenSSL HAVE_ALT REQUIRED_VERSION - ${ARROW_OPENSSL_REQUIRED_VERSION}) - endif() # Build google-cloud-cpp, with only storage_client diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4776388e10970..c512a99f66d34 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -119,7 +119,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME) ${ARG_UNPARSED_ARGUMENTS}) endfunction() -macro(append_avx2_src SRC) +macro(append_runtime_avx2_src SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ARROW_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -127,7 +127,7 @@ macro(append_avx2_src SRC) endif() endmacro() -macro(append_avx512_src SRC) +macro(append_runtime_avx512_src SRC) if(ARROW_HAVE_RUNTIME_AVX512) list(APPEND ARROW_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -255,8 +255,8 @@ if(ARROW_JEMALLOC) PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() -append_avx2_src(util/bpacking_avx2.cc) -append_avx512_src(util/bpacking_avx512.cc) +append_runtime_avx2_src(util/bpacking_avx2.cc) +append_runtime_avx512_src(util/bpacking_avx512.cc) if(ARROW_HAVE_NEON) list(APPEND ARROW_SRCS util/bpacking_neon.cc) @@ -426,11 +426,11 @@ list(APPEND compute/row/row_internal.cc compute/util.cc) -append_avx2_src(compute/key_hash_avx2.cc) -append_avx2_src(compute/key_map_avx2.cc) -append_avx2_src(compute/row/compare_internal_avx2.cc) -append_avx2_src(compute/row/encode_internal_avx2.cc) -append_avx2_src(compute/util_avx2.cc) +append_runtime_avx2_src(compute/key_hash_avx2.cc) +append_runtime_avx2_src(compute/key_map_avx2.cc) +append_runtime_avx2_src(compute/row/compare_internal_avx2.cc) +append_runtime_avx2_src(compute/row/encode_internal_avx2.cc) +append_runtime_avx2_src(compute/util_avx2.cc) if(ARROW_COMPUTE) # Include the remaining kernels @@ -465,8 +465,8 @@ if(ARROW_COMPUTE) compute/kernels/vector_select_k.cc compute/kernels/vector_sort.cc) - append_avx2_src(compute/kernels/aggregate_basic_avx2.cc) - append_avx512_src(compute/kernels/aggregate_basic_avx512.cc) + append_runtime_avx2_src(compute/kernels/aggregate_basic_avx2.cc) + append_runtime_avx512_src(compute/kernels/aggregate_basic_avx512.cc) endif() if(ARROW_FILESYSTEM) @@ -482,6 +482,12 @@ if(ARROW_FILESYSTEM) filesystem/path_util.cc filesystem/util_internal.cc) + if(ARROW_AZURE) + list(APPEND ARROW_SRCS filesystem/azurefs.cc) + set_source_files_properties(filesystem/azurefs.cc + PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) + endif() if(ARROW_GCS) list(APPEND ARROW_SRCS filesystem/gcsfs.cc filesystem/gcsfs_internal.cc) set_source_files_properties(filesystem/gcsfs.cc filesystem/gcsfs_internal.cc diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 287884432b9fe..c2c91db58d38a 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -19,7 +19,7 @@ add_custom_target(arrow_acero) arrow_install_all_headers("arrow/acero") -macro(append_acero_avx2_src SRC) +macro(append_acero_runtime_avx2_src SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ARROW_ACERO_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -56,8 +56,8 @@ set(ARROW_ACERO_SRCS union_node.cc util.cc) -append_acero_avx2_src(bloom_filter_avx2.cc) -append_acero_avx2_src(swiss_join_avx2.cc) +append_acero_runtime_avx2_src(bloom_filter_avx2.cc) +append_acero_runtime_avx2_src(swiss_join_avx2.cc) 
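The append_runtime_avx2_src/append_acero_runtime_avx2_src renames above make explicit that these sources are runtime-dispatched, not unconditionally compiled for AVX2. A minimal sketch of the idiom, before the acero target definition continues below (all names are hypothetical stand-ins; in Arrow the SIMD body lives in a *_avx2.cc translation unit built with AVX2 flags, and the flag constant is arrow::internal::CpuInfo::AVX2):

    #include <cstdint>

    constexpr int64_t kAvx2 = 1;  // stand-in for CpuInfo::AVX2

    // SIMD path: handles full blocks of 8 rows, reports how many it took.
    int64_t InsertAvx2(int64_t num_rows, const uint32_t*) {
      return num_rows - (num_rows % 8);
    }

    // Scalar path: always available, one row at a time.
    void InsertScalar(int64_t /*num_rows*/, const uint32_t*) {}

    void Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes) {
      int64_t num_processed = 0;
      if (hardware_flags & kAvx2) {  // decided per call, at runtime
        num_processed = InsertAvx2(num_rows, hashes);
      }
      // The scalar path finishes the tail, so results are flag-independent.
      InsertScalar(num_rows - num_processed, hashes + num_processed);
    }

    int main() {
      uint32_t hashes[19] = {};
      Insert(kAvx2, 19, hashes);  // 16 rows via the SIMD path, 3 scalar
      Insert(0, 19, hashes);      // all 19 rows scalar
    }

This is the same shape as the BlockedBloomFilter::Insert/Find changes further down, where the compile-time guard becomes ARROW_HAVE_RUNTIME_AVX2 while the branch stays keyed on hardware_flags.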
set(ARROW_ACERO_SHARED_LINK_LIBS) set(ARROW_ACERO_STATIC_LINK_LIBS) diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 98e5918ebbf55..b7f5d878e5881 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -524,7 +524,7 @@ class KeyHasher { size_t index_; std::vector<col_index_t> indices_; std::vector<KeyColumnMetadata> metadata_; - const RecordBatch* batch_; + std::atomic<const RecordBatch*> batch_; std::vector<HashType> hashes_; LightContext ctx_; std::vector<KeyColumnArray> column_arrays_; @@ -819,7 +819,6 @@ class InputState { have_active_batch &= !queue_.TryPop(); if (have_active_batch) { DCHECK_GT(queue_.UnsyncFront()->num_rows(), 0); // empty batches disallowed - key_hasher_->Invalidate(); // batch changed - invalidate key hasher's cache memo_.UpdateTime(GetTime(queue_.UnsyncFront().get(), 0)); // time changed } } @@ -897,7 +896,8 @@ class InputState { Status Push(const std::shared_ptr<RecordBatch>& rb) { if (rb->num_rows() > 0) { - queue_.Push(rb); // only after above updates - push batch for processing + key_hasher_->Invalidate(); // batch changed - invalidate key hasher's cache + queue_.Push(rb); // only now push batch for processing } else { ++batches_processed_; // don't enqueue empty batches, just record as processed } } diff --git a/cpp/src/arrow/acero/bloom_filter.cc b/cpp/src/arrow/acero/bloom_filter.cc index ad5e66ded0613..b9855ee506d27 100644 --- a/cpp/src/arrow/acero/bloom_filter.cc +++ b/cpp/src/arrow/acero/bloom_filter.cc @@ -123,7 +123,7 @@ void BlockedBloomFilter::InsertImp(int64_t num_rows, const T* hashes) { void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes) { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = Insert_avx2(num_rows, hashes); } @@ -134,7 +134,7 @@ void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes) { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = Insert_avx2(num_rows, hashes); } @@ -181,7 +181,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, int64_t num_rows, bool enable_prefetch) const { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (!(enable_prefetch && UsePrefetch()) && (hardware_flags & arrow::internal::CpuInfo::AVX2)) { num_processed = Find_avx2(num_rows, hashes, result_bit_vector); @@ -202,7 +202,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, int64_t num_rows, bool enable_prefetch) const { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (!(enable_prefetch && UsePrefetch()) && (hardware_flags & arrow::internal::CpuInfo::AVX2)) { num_processed = Find_avx2(num_rows, hashes, result_bit_vector); diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h index b8f7f8cd256b1..50d07bfd948e0 100644 --- a/cpp/src/arrow/acero/bloom_filter.h +++ b/cpp/src/arrow/acero/bloom_filter.h @@ -17,13 +17,14 @@ #pragma once -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) #include <immintrin.h> #endif #include <atomic> #include <cstdint> #include <memory> + #include "arrow/acero/partition_util.h" #include "arrow/acero/util.h" #include "arrow/memory_pool.h" @@ -203,7 +204,7 @@ class ARROW_ACERO_EXPORT BlockedBloomFilter { void SingleFold(int num_folds); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) inline __m256i mask_avx2(__m256i hash) const; inline __m256i block_id_avx2(__m256i hash) const; int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes);
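Stepping back to the asof_join_node.cc hunks above: the fix makes the KeyHasher's cache tag a std::atomic<const RecordBatch*> and moves Invalidate() to before the batch is pushed onto the queue, so the hashing thread can never pair cached hashes with a batch it has not actually hashed. A simplified single-producer sketch of that pattern (toy types, not the Arrow API):

    #include <atomic>
    #include <cstdint>
    #include <vector>

    struct Batch {
      std::vector<uint64_t> values;
    };

    class CachingHasher {
     public:
      // Producer side: call *before* publishing a new batch, mirroring the
      // move of Invalidate() into InputState::Push() above.
      void Invalidate() { batch_.store(nullptr, std::memory_order_relaxed); }

      // Consumer side: recompute only when the tag does not match.
      const std::vector<uint64_t>& HashesFor(const Batch* batch) {
        if (batch_.load(std::memory_order_relaxed) != batch) {
          hashes_ = batch->values;  // stand-in for real key hashing
          batch_.store(batch, std::memory_order_relaxed);
        }
        return hashes_;
      }

     private:
      std::atomic<const Batch*> batch_{nullptr};  // cache tag shared across threads
      std::vector<uint64_t> hashes_;
    };

Invalidating before the push closes the window in which the consumer could pop the new batch while a stale tag still looked valid; the atomic makes the tag itself race-free.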
diff --git a/cpp/src/arrow/acero/bloom_filter_avx2.cc b/cpp/src/arrow/acero/bloom_filter_avx2.cc index b6c281276db8d..5816bb4fc0a32 100644 --- a/cpp/src/arrow/acero/bloom_filter_avx2.cc +++ b/cpp/src/arrow/acero/bloom_filter_avx2.cc @@ -16,14 +16,13 @@ // under the License. #include <immintrin.h> + #include "arrow/acero/bloom_filter.h" #include "arrow/util/bit_util.h" namespace arrow { namespace acero { -#if defined(ARROW_HAVE_AVX2) - inline __m256i BlockedBloomFilter::mask_avx2(__m256i hash) const { // AVX2 translation of mask() method // @@ -132,7 +131,5 @@ int64_t BlockedBloomFilter::Insert_avx2(int64_t num_rows, const uint64_t* hashes return InsertImp_avx2(num_rows, hashes); } -#endif - } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index de433ac68c11a..95375e277e2b8 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -22,13 +22,13 @@ #include #include #include + #include "arrow/acero/bloom_filter.h" #include "arrow/acero/task_util.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/compute/key_hash.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/cpu_info.h" namespace arrow { @@ -171,9 +171,7 @@ void TestBloomSmallHashHelper(int64_t num_input_hashes, const T* input_hashes, // Output FPR and build and probe cost. // void TestBloomSmall(BloomFilterBuildStrategy strategy, int64_t num_build, - int num_build_copies, bool use_simd, bool enable_prefetch) { - int64_t hardware_flags = use_simd ? ::arrow::internal::CpuInfo::AVX2 : 0; - + int num_build_copies, int64_t hardware_flags, bool enable_prefetch) { // Generate input keys // int64_t num_probe = 4 * num_build; @@ -324,10 +322,8 @@ void TestBloomLargeHashHelper(int64_t hardware_flags, int64_t block, // Test with larger size Bloom filters (use large prime with arithmetic // sequence modulo 2^64). // -void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, bool use_simd, - bool enable_prefetch) { - int64_t hardware_flags = use_simd ?
::arrow::internal::CpuInfo::AVX2 : 0; - +void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, + int64_t hardware_flags, bool enable_prefetch) { // Largest 63-bit prime constexpr uint64_t prime = 0x7FFFFFFFFFFFFFE7ULL; @@ -458,42 +454,40 @@ TEST(BloomFilter, Basic) { num_build.push_back(1LL << log_large); #endif - constexpr int num_param_sets = 3; - struct { - bool use_avx2; + struct TestParam { + int64_t hardware_flags; bool enable_prefetch; bool insert_multiple_copies; - } params[num_param_sets]; - for (int i = 0; i < num_param_sets; ++i) { - params[i].use_avx2 = (i == 1); - params[i].enable_prefetch = (i == 2); - params[i].insert_multiple_copies = (i == 3); + }; + std::vector<TestParam> test_params; + for (const auto hardware_flags : HardwareFlagsForTesting()) { + test_params.push_back({hardware_flags, false, false}); } + test_params.push_back({0, true, false}); + test_params.push_back({0, false, true}); - std::vector<BloomFilterBuildStrategy> strategy; - strategy.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); + std::vector<BloomFilterBuildStrategy> strategies; + strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); #ifndef ARROW_VALGRIND - strategy.push_back(BloomFilterBuildStrategy::PARALLEL); + strategies.push_back(BloomFilterBuildStrategy::PARALLEL); #endif static constexpr int64_t min_rows_for_large = 2 * 1024 * 1024; - for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) { - for (int iparam_set = 0; iparam_set < num_param_sets; ++iparam_set) { - ARROW_SCOPED_TRACE("%s ", params[iparam_set].use_avx2 ? "AVX2" - : params[iparam_set].enable_prefetch ? "PREFETCH" - : params[iparam_set].insert_multiple_copies ? "FOLDING" - : "REGULAR"); - for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) { - ARROW_SCOPED_TRACE("num_build ", static_cast<int>(num_build[inum_build])); - if (num_build[inum_build] >= min_rows_for_large) { - TestBloomLarge(strategy[istrategy], num_build[inum_build], - params[iparam_set].use_avx2, params[iparam_set].enable_prefetch); + for (const auto& strategy : strategies) { + for (const auto& test_param : test_params) { + ARROW_SCOPED_TRACE("hardware_flags = ", test_param.hardware_flags, + test_param.enable_prefetch ? " PREFETCH" : "", + test_param.insert_multiple_copies ? " FOLDING" : "REGULAR"); + for (const auto n : num_build) { + ARROW_SCOPED_TRACE("num_build ", n); + if (n >= min_rows_for_large) { + TestBloomLarge(strategy, n, test_param.hardware_flags, + test_param.enable_prefetch); } else { - TestBloomSmall(strategy[istrategy], num_build[inum_build], - params[iparam_set].insert_multiple_copies ? 8 : 1, - params[iparam_set].use_avx2, params[iparam_set].enable_prefetch); + TestBloomSmall(strategy, n, test_param.insert_multiple_copies ? 8 : 1, + test_param.hardware_flags, test_param.enable_prefetch); } } } @@ -506,19 +500,18 @@ TEST(BloomFilter, Scaling) { num_build.push_back(1000000); num_build.push_back(4000000); - std::vector<BloomFilterBuildStrategy> strategy; - strategy.push_back(BloomFilterBuildStrategy::PARALLEL); - - for (bool use_avx2 : {false, true}) { - for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) { - for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) { - ARROW_SCOPED_TRACE("num_build = ", static_cast<int>(num_build[inum_build])); - ARROW_SCOPED_TRACE("strategy = ", - strategy[istrategy] == BloomFilterBuildStrategy::PARALLEL - ? "PARALLEL" - : "SINGLE_THREADED"); - ARROW_SCOPED_TRACE("avx2 = ", use_avx2 ?
"AVX2" : "SCALAR"); - TestBloomLarge(strategy[istrategy], num_build[inum_build], use_avx2, + std::vector strategies; + strategies.push_back(BloomFilterBuildStrategy::PARALLEL); + + for (const auto hardware_flags : HardwareFlagsForTesting()) { + for (const auto& strategy : strategies) { + for (const auto n : num_build) { + ARROW_SCOPED_TRACE("num_build = ", n); + ARROW_SCOPED_TRACE("strategy = ", strategy == BloomFilterBuildStrategy::PARALLEL + ? "PARALLEL" + : "SINGLE_THREADED"); + ARROW_SCOPED_TRACE("hardware_flags = ", hardware_flags); + TestBloomLarge(strategy, n, hardware_flags, /*enable_prefetch=*/false); } } diff --git a/cpp/src/arrow/acero/exec_plan.h b/cpp/src/arrow/acero/exec_plan.h index 04303aa9512b1..dba6c64ddc837 100644 --- a/cpp/src/arrow/acero/exec_plan.h +++ b/cpp/src/arrow/acero/exec_plan.h @@ -739,7 +739,7 @@ DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context); /// \brief Utility method to run a declaration and return results as a RecordBatchReader /// /// If an exec context is not provided then a default exec context will be used based -/// on the value of `use_threads`. If `use_threads` is false then the CPU exeuctor will +/// on the value of `use_threads`. If `use_threads` is false then the CPU executor will /// be a serial executor and all CPU work will be done on the calling thread. I/O tasks /// will still happen on the I/O executor and may be multi-threaded. /// diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index a617914164953..254dad361ff87 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -236,14 +236,14 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc const auto& type = *field->type(); if (!IsTypeSupported(type)) { return Status::Invalid("Data type ", type, - " is not supported in join non-key field"); + " is not supported in join non-key field ", field->name()); } } for (const auto& field : right_schema.fields()) { const auto& type = *field->type(); if (!IsTypeSupported(type)) { return Status::Invalid("Data type ", type, - " is not supported in join non-key field"); + " is not supported in join non-key field ", field->name()); } } diff --git a/cpp/src/arrow/acero/options.h b/cpp/src/arrow/acero/options.h index bb94bdaa4a628..1ede3fbfc8ed0 100644 --- a/cpp/src/arrow/acero/options.h +++ b/cpp/src/arrow/acero/options.h @@ -80,7 +80,7 @@ class ARROW_ACERO_EXPORT ExecNodeOptions { /// /// For each batch received a new task will be created to push that batch downstream. /// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the -/// parent batch and call InputRecieved. Thus, if the `generator` yields a large +/// parent batch and call InputReceived. Thus, if the `generator` yields a large /// batch it may result in several calls to InputReceived. /// /// The SourceNode will, by default, assign an implicit ordering to outgoing batches. @@ -115,7 +115,7 @@ class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions { /// Create an instance from values TableSourceNodeOptions(std::shared_ptr table, int64_t max_batch_size = kDefaultMaxBatchSize) - : table(table), max_batch_size(max_batch_size) {} + : table(std::move(table)), max_batch_size(max_batch_size) {} /// \brief a table which acts as the data source std::shared_ptr
table; @@ -135,7 +135,7 @@ class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions { public: /// Create an instance from values NamedTableNodeOptions(std::vector names, std::shared_ptr schema) - : names(std::move(names)), schema(schema) {} + : names(std::move(names)), schema(std::move(schema)) {} /// \brief the names to put in the serialized plan std::vector names; @@ -156,7 +156,7 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { /// Create an instance that will create a new task on io_executor for each iteration SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, arrow::internal::Executor* io_executor) - : schema(schema), + : schema(std::move(schema)), it_maker(std::move(it_maker)), io_executor(io_executor), requires_io(true) {} @@ -165,7 +165,7 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { /// executor SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, bool requires_io = false) - : schema(schema), + : schema(std::move(schema)), it_maker(std::move(it_maker)), io_executor(NULLPTR), requires_io(requires_io) {} diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index d5c0b7817f55f..0888dd8938455 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace acero { -#if defined(ARROW_HAVE_AVX2) - template int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, @@ -191,7 +189,5 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, return num_rows - (num_rows % unroll); } -#endif - } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index cd12b34a0c6dc..88b80f06f57f2 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -80,7 +80,7 @@ class RowArrayAccessor { const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); private: -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) // This is equivalent to Visit method, but processing 8 rows at a time in a // loop. 
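
The swiss join AVX2 translation unit above is now compiled whenever the build supports runtime AVX2 (the per-file #if defined(ARROW_HAVE_AVX2) guard is removed), and the header declaration is guarded by ARROW_HAVE_RUNTIME_AVX2 instead. A minimal sketch of the runtime-dispatch pattern this enables follows; the function names here are illustrative, not the actual swiss join entry points.

// Sketch of runtime SIMD dispatch, assuming illustrative names.
#include <cstdint>
#include "arrow/util/cpu_info.h"

int VisitScalar(int num_rows);  // hypothetical scalar fallback
#if defined(ARROW_HAVE_RUNTIME_AVX2)
int Visit_avx2(int num_rows);   // compiled whenever the toolchain can emit AVX2
#endif

int Visit(int64_t hardware_flags, int num_rows) {
  int num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
    // The vectorized path works in blocks of 8 rows and may leave a tail.
    num_processed = Visit_avx2(num_rows);
  }
#endif
  // Scalar code finishes whatever the vectorized path did not cover.
  return num_processed + VisitScalar(num_rows - num_processed);
}
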
// Returns the number of processed rows, which may be less than requested (up diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc index 2042650be6acb..f50ca92238dc4 100644 --- a/cpp/src/arrow/acero/test_util_internal.cc +++ b/cpp/src/arrow/acero/test_util_internal.cc @@ -45,8 +45,10 @@ #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/async_generator.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/unreachable.h" @@ -54,6 +56,7 @@ namespace arrow { +using arrow::internal::CpuInfo; using arrow::internal::Executor; using compute::SortKey; @@ -62,6 +65,7 @@ using compute::Take; namespace acero { namespace { + void ValidateOutputImpl(const ArrayData& output) { ASSERT_OK(::arrow::internal::ValidateArrayFull(output)); TestInitialized(output); @@ -116,6 +120,11 @@ void ValidateOutput(const Datum& output) { } } +std::vector HardwareFlagsForTesting() { + // Acero currently only has AVX2 optimizations + return arrow::GetSupportedHardwareFlags({CpuInfo::AVX2}); +} + namespace { struct DummyNode : ExecNode { diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h index 03f417028650b..569fb1254db4a 100644 --- a/cpp/src/arrow/acero/test_util_internal.h +++ b/cpp/src/arrow/acero/test_util_internal.h @@ -20,6 +20,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/vector.h" +#include #include #include #include @@ -33,12 +34,14 @@ #include "arrow/util/async_generator.h" #include "arrow/util/pcg_random.h" -namespace arrow { - -namespace acero { +namespace arrow::acero { void ValidateOutput(const Datum& output); +// Enumerate all hardware flags that can be tested on this platform +// and would lead to different code paths being tested in Acero. +std::vector HardwareFlagsForTesting(); + using StartProducingFunc = std::function; using StopProducingFunc = std::function; @@ -204,5 +207,4 @@ struct TableGenerationProperties { Result> MakeRandomTimeSeriesTable( const TableGenerationProperties& properties); -} // namespace acero -} // namespace arrow +} // namespace arrow::acero diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 48d65e30de8d9..7e857bf20568e 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -106,7 +106,7 @@ class ARROW_EXPORT Array { /// \see GetNullCount int64_t ComputeLogicalNullCount() const; - std::shared_ptr type() const { return data_->type; } + const std::shared_ptr& type() const { return data_->type; } Type::type type_id() const { return data_->type->id(); } /// Buffer for the validity (null) bitmap, if any. 
Note that Union types @@ -251,7 +251,7 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { int64_t null_count = kUnknownNullCount, int64_t offset = 0); /// Does not account for any slice offset - std::shared_ptr values() const { return data_->buffers[1]; } + const std::shared_ptr& values() const { return data_->buffers[1]; } protected: PrimitiveArray() : raw_values_(NULLPTR) {} diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 8fbe9f69d7897..cccc7bb78220d 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -50,7 +50,7 @@ using internal::CopyBitmap; // ---------------------------------------------------------------------- // DictionaryArray -std::shared_ptr DictionaryArray::indices() const { return indices_; } +const std::shared_ptr& DictionaryArray::indices() const { return indices_; } int64_t DictionaryArray::GetValueIndex(int64_t i) const { const uint8_t* indices_data = data_->buffers[1]->data(); @@ -106,8 +106,9 @@ DictionaryArray::DictionaryArray(const std::shared_ptr& type, SetData(data); } -std::shared_ptr DictionaryArray::dictionary() const { +const std::shared_ptr& DictionaryArray::dictionary() const { if (!dictionary_) { + // TODO(GH-36503) this isn't thread safe dictionary_ = MakeArray(data_->dictionary); } return dictionary_; diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index 8791eaa07db3a..b7d4db4b415b4 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -101,8 +101,8 @@ class ARROW_EXPORT DictionaryArray : public Array { /// \brief Return the dictionary for this array, which is stored as /// a member of the ArrayData internal structure - std::shared_ptr dictionary() const; - std::shared_ptr indices() const; + const std::shared_ptr& dictionary() const; + const std::shared_ptr& indices() const; /// \brief Return the ith value of indices, cast to int64_t. Not recommended /// for use in performance-sensitive code. Does not validate whether the diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 2a00cadcab9aa..a3a2f99851b55 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -215,17 +215,20 @@ class TestListArray : public ::testing::Test { // Offsets with nulls will match. 
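
The getter changes in this hunk and the ones above (type(), values(), indices(), dictionary()) return const std::shared_ptr<T>& instead of a shared_ptr by value, which removes an atomic ref-count increment and decrement from every call. A sketch of the difference from the caller's side:

// Callers that only inspect the object can bind a reference (no ref-count
// traffic); callers that need ownership still copy explicitly.
void Inspect(const arrow::Array& array) {
  const std::shared_ptr<arrow::DataType>& t = array.type();  // no atomic ops
  std::shared_ptr<arrow::DataType> owned = array.type();     // one explicit incref
  (void)t;
  (void)owned;
}
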
ASSERT_OK_AND_ASSIGN(auto result, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, expected->null_bitmap())); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap @@ -233,6 +236,58 @@ class TestListArray : public ::testing::Test { expected->null_bitmap())); } + void TestFromArraysWithSlicedOffsets() { + std::vector offsets = {-1, -1, 0, 1, 2, 4}; + + std::shared_ptr offsets_wo_nulls; + ArrayFromVector(offsets, &offsets_wo_nulls); + + auto type = std::make_shared(int32()); + auto expected = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[0], [1], [0, null]]")); + auto values = expected->values(); + + // Apply an offset to the offsets array + auto sliced_offsets = offsets_wo_nulls->Slice(2, 4); + ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Non-zero starter offset + sliced_offsets = offsets_wo_nulls->Slice(3, 3); + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected->Slice(1, 2)); + } + + void TestFromArraysWithSlicedNullOffsets() { + std::vector offsets = {-1, -1, 0, 1, 1, 3}; + std::vector offsets_w_nulls_is_valid = {true, true, true, false, true, true}; + + std::shared_ptr offsets_w_nulls; + ArrayFromVector(offsets_w_nulls_is_valid, offsets, + &offsets_w_nulls); + + auto type = std::make_shared(int32()); + auto expected = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[0], null, [0, null]]")); + auto values = expected->values(); + + // Apply an offset to the offsets array with nulls (GH-36776) + auto sliced_offsets = offsets_w_nulls->Slice(2, 4); + ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Non-zero starter offset + sliced_offsets = offsets_w_nulls->Slice(3, 3); + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected->Slice(1, 2)); + } + void TestFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; @@ -586,6 +641,8 @@ TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { this->TestFromArraysWithNullBitMap(); + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index f99163206ea57..df60074c78470 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -52,6 +52,9 @@ using internal::CopyBitmap; namespace { +/// \brief Clean offsets when their null_count is greater than 0 +/// +/// \pre offsets.null_count() > 0 
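
The two new test helpers above pin down the sliced-offsets behavior. Roughly, the case they exercise (written in the same test style, using the fixture's pool_):

// Building a ListArray from an offsets array that is itself a slice.
// FromArrays now honors the slice's non-zero array offset (and, per GH-36776,
// a sliced validity bitmap on the offsets).
auto offsets = ArrayFromJSON(int32(), "[-1, -1, 0, 1, 2, 4]");
auto values = ArrayFromJSON(int32(), "[0, 1, 0, null]");
auto sliced_offsets = offsets->Slice(2, 4);  // logical contents: [0, 1, 2, 4]
ASSERT_OK_AND_ASSIGN(auto list_array,
                     ListArray::FromArrays(*sliced_offsets, *values, pool_));
// list_array is [[0], [1], [0, null]]
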
template Result CleanListOffsets(const std::shared_ptr& validity_buffer, const Array& offsets, MemoryPool* pool) { @@ -59,43 +62,36 @@ Result CleanListOffsets(const std::shared_ptr& validity_bu using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; - const auto& typed_offsets = checked_cast(offsets); + DCHECK_GT(offsets.null_count(), 0); const int64_t num_offsets = offsets.length(); - DCHECK(validity_buffer == nullptr || offsets.null_count() == 0) - << "When a validity_buffer is passed, offsets must have no nulls"; + if (!offsets.IsValid(num_offsets - 1)) { + return Status::Invalid("Last list offset should be non-null"); + } - if (offsets.null_count() > 0) { - if (!offsets.IsValid(num_offsets - 1)) { - return Status::Invalid("Last list offset should be non-null"); - } + ARROW_ASSIGN_OR_RAISE(auto clean_offsets, + AllocateBuffer(num_offsets * sizeof(offset_type), pool)); - ARROW_ASSIGN_OR_RAISE(auto clean_offsets, - AllocateBuffer(num_offsets * sizeof(offset_type), pool)); + // Copy valid bits, ignoring the final offset (since for a length N list array, + // we have N + 1 offsets) + ARROW_ASSIGN_OR_RAISE( + auto clean_validity_buffer, + CopyBitmap(pool, offsets.null_bitmap()->data(), offsets.offset(), num_offsets - 1)); - // Copy valid bits, ignoring the final offset (since for a length N list array, - // we have N + 1 offsets) - ARROW_ASSIGN_OR_RAISE( - auto clean_validity_buffer, - offsets.null_bitmap()->CopySlice(0, bit_util::BytesForBits(num_offsets - 1))); - - const offset_type* raw_offsets = typed_offsets.raw_values(); - auto clean_raw_offsets = - reinterpret_cast(clean_offsets->mutable_data()); - - // Must work backwards so we can tell how many values were in the last non-null value - offset_type current_offset = raw_offsets[num_offsets - 1]; - for (int64_t i = num_offsets - 1; i >= 0; --i) { - if (offsets.IsValid(i)) { - current_offset = raw_offsets[i]; - } - clean_raw_offsets[i] = current_offset; - } + const offset_type* raw_offsets = + checked_cast(offsets).raw_values(); + auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); - return BufferVector({std::move(clean_validity_buffer), std::move(clean_offsets)}); + // Must work backwards so we can tell how many values were in the last non-null value + offset_type current_offset = raw_offsets[num_offsets - 1]; + for (int64_t i = num_offsets - 1; i >= 0; --i) { + if (offsets.IsValid(i)) { + current_offset = raw_offsets[i]; + } + clean_raw_offsets[i] = current_offset; } - return BufferVector({validity_buffer, typed_offsets.values()}); + return BufferVector({std::move(clean_validity_buffer), std::move(clean_offsets)}); } template @@ -124,14 +120,21 @@ Result::ArrayType>> ListArrayFromArray return Status::NotImplemented("Null bitmap with offsets slice not supported."); } - std::shared_ptr offset_buf, validity_buf; - ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(null_bitmap, offsets, pool)); - int64_t null_count_ = null_bitmap ? null_count : offsets.null_count(); + // Clean the offsets if they contain nulls. 
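
The backward pass in CleanListOffsets above is the core of the null handling: a null offset inherits the next valid offset, which turns each null slot into a zero-length run while keeping the offsets monotonic. A standalone sketch of that fill:

// Standalone sketch of the backward fill (simplified, std types only).
#include <cstdint>
#include <optional>
#include <vector>

std::vector<int32_t> CleanOffsets(const std::vector<std::optional<int32_t>>& offsets) {
  std::vector<int32_t> clean(offsets.size());
  // The last offset is required to be valid (the code above errors otherwise).
  int32_t current = *offsets.back();
  for (int64_t i = static_cast<int64_t>(offsets.size()) - 1; i >= 0; --i) {
    if (offsets[i].has_value()) current = *offsets[i];
    clean[i] = current;
  }
  return clean;
}
// e.g. {0, nullopt, 1, 3} -> {0, 1, 1, 3}: entry 1 becomes an empty list.
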
+ if (offsets.null_count() > 0) { + ARROW_ASSIGN_OR_RAISE(auto buffers, + CleanListOffsets(null_bitmap, offsets, pool)); + auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), + {values.data()}, offsets.null_count(), /*offset=*/0); + return std::make_shared(std::move(data)); + } - std::shared_ptr internal_data = ArrayData::Make( - type, offsets.length() - 1, std::move(buffers), null_count_, offsets.offset()); - internal_data->child_data.push_back(values.data()); - return std::make_shared(internal_data); + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(offsets); + auto buffers = BufferVector({std::move(null_bitmap), typed_offsets.values()}); + auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), + {values.data()}, null_count, offsets.offset()); + return std::make_shared(std::move(data)); } static std::shared_ptr SliceArrayWithOffsets(const Array& array, int64_t begin, @@ -374,10 +377,18 @@ Result> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } - ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(NULLPTR, *offsets, pool)); + if (offsets->null_count() > 0) { + ARROW_ASSIGN_OR_RAISE(auto buffers, + CleanListOffsets(NULLPTR, *offsets, pool)); + return std::make_shared(type, offsets->length() - 1, std::move(buffers), + keys, items, offsets->null_count(), 0); + } + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(*offsets); + auto buffers = BufferVector({nullptr, typed_offsets.values()}); return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, - items, offsets->null_count(), offsets->offset()); + items, /*null_count=*/0, offsets->offset()); } Result> MapArray::FromArrays(const std::shared_ptr& offsets, @@ -470,11 +481,11 @@ const FixedSizeListType* FixedSizeListArray::list_type() const { return checked_cast(data_->type.get()); } -std::shared_ptr FixedSizeListArray::value_type() const { +const std::shared_ptr& FixedSizeListArray::value_type() const { return list_type()->value_type(); } -std::shared_ptr FixedSizeListArray::values() const { return values_; } +const std::shared_ptr& FixedSizeListArray::values() const { return values_; } Result> FixedSizeListArray::FromArrays( const std::shared_ptr& values, int32_t list_size) { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 4f5f3f614cb64..47c1db039ccc9 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -71,12 +71,12 @@ class BaseListArray : public Array { /// \brief Return array object containing the list's values /// /// Note that this buffer does not account for any slice offset or length. - std::shared_ptr values() const { return values_; } + const std::shared_ptr& values() const { return values_; } /// Note that this buffer does not account for any slice offset or length. 
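
With all-valid offsets, MapArray::FromArraysInternal (like the list case above) now takes a zero-copy path that wraps the existing offsets buffer, preserves offsets->offset(), and reports a null count of zero. Usage is unchanged; a sketch:

// Usage sketch: with no nulls in the offsets, no offset copy is made.
auto offsets = ArrayFromJSON(int32(), "[0, 1, 3]");
auto keys = ArrayFromJSON(utf8(), R"(["a", "b", "c"])");
auto items = ArrayFromJSON(int32(), "[1, 2, 3]");
ASSERT_OK_AND_ASSIGN(auto map_array, MapArray::FromArrays(offsets, keys, items));
// Two maps: {"a": 1} and {"b": 2, "c": 3}.
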
- std::shared_ptr value_offsets() const { return data_->buffers[1]; } + const std::shared_ptr& value_offsets() const { return data_->buffers[1]; } - std::shared_ptr value_type() const { return list_type_->value_type(); } + const std::shared_ptr& value_type() const { return list_type_->value_type(); } /// Return pointer to raw value offsets accounting for any slice offset const offset_type* raw_value_offsets() const { @@ -269,10 +269,10 @@ class ARROW_EXPORT MapArray : public ListArray { const MapType* map_type() const { return map_type_; } /// \brief Return array object containing all map keys - std::shared_ptr keys() const { return keys_; } + const std::shared_ptr& keys() const { return keys_; } /// \brief Return array object containing all mapped items - std::shared_ptr items() const { return items_; } + const std::shared_ptr& items() const { return items_; } /// Validate child data before constructing the actual MapArray. static Status ValidateChildData( @@ -310,9 +310,9 @@ class ARROW_EXPORT FixedSizeListArray : public Array { const FixedSizeListType* list_type() const; /// \brief Return array object containing the list's values - std::shared_ptr values() const; + const std::shared_ptr& values() const; - std::shared_ptr value_type() const; + const std::shared_ptr& value_type() const; // The following functions will not perform boundschecking int64_t value_offset(int64_t i) const { @@ -432,7 +432,7 @@ class ARROW_EXPORT UnionArray : public Array { using type_code_t = int8_t; /// Note that this buffer does not account for any slice offset - std::shared_ptr type_codes() const { return data_->buffers[1]; } + const std::shared_ptr& type_codes() const { return data_->buffers[1]; } const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; } @@ -571,7 +571,7 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { } /// Note that this buffer does not account for any slice offset - std::shared_ptr value_offsets() const { return data_->buffers[2]; } + const std::shared_ptr& value_offsets() const { return data_->buffers[2]; } int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } diff --git a/cpp/src/arrow/array/array_run_end_test.cc b/cpp/src/arrow/array/array_run_end_test.cc index bc8b929c53aff..3e8c658726809 100644 --- a/cpp/src/arrow/array/array_run_end_test.cc +++ b/cpp/src/arrow/array/array_run_end_test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "arrow/array.h" @@ -127,34 +128,55 @@ TEST_P(TestRunEndEncodedArray, FromRunEndsAndValues) { RunEndEncodedArray::Make(30, run_end_values, ArrayFromJSON(int32(), "[2, 0]"))); } -TEST_P(TestRunEndEncodedArray, FindOffsetAndLength) { +TEST_P(TestRunEndEncodedArray, FindPhysicalRange) { auto run_ends = ArrayFromJSON(run_end_type, "[100, 200, 300, 400, 500]"); auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of", "REE"])"); ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(500, run_ends, values)); + auto Range = [](int64_t offset, int64_t length) -> std::pair { + return std::make_pair(offset, length); + }; ASSERT_EQ(ree_array->FindPhysicalOffset(), 0); ASSERT_EQ(ree_array->FindPhysicalLength(), 5); + ASSERT_EQ(ree_util::FindPhysicalRange(*ree_array->data(), ree_array->offset(), + ree_array->length()), + Range(0, 5)); auto slice = std::dynamic_pointer_cast(ree_array->Slice(199, 5)); ASSERT_EQ(slice->FindPhysicalOffset(), 1); ASSERT_EQ(slice->FindPhysicalLength(), 2); + ASSERT_EQ(ree_util::FindPhysicalRange(*slice->data(), 
slice->offset(), slice->length()), + Range(1, 2)); auto slice2 = std::dynamic_pointer_cast(ree_array->Slice(199, 101)); ASSERT_EQ(slice2->FindPhysicalOffset(), 1); ASSERT_EQ(slice2->FindPhysicalLength(), 2); + ASSERT_EQ( + ree_util::FindPhysicalRange(*slice2->data(), slice2->offset(), slice2->length()), + Range(1, 2)); auto slice3 = std::dynamic_pointer_cast(ree_array->Slice(400, 100)); ASSERT_EQ(slice3->FindPhysicalOffset(), 4); ASSERT_EQ(slice3->FindPhysicalLength(), 1); + ASSERT_EQ( + ree_util::FindPhysicalRange(*slice3->data(), slice3->offset(), slice3->length()), + Range(4, 1)); auto slice4 = std::dynamic_pointer_cast(ree_array->Slice(0, 150)); ASSERT_EQ(slice4->FindPhysicalOffset(), 0); ASSERT_EQ(slice4->FindPhysicalLength(), 2); + ASSERT_EQ( + ree_util::FindPhysicalRange(*slice4->data(), slice4->offset(), slice4->length()), + Range(0, 2)); auto zero_length_at_end = std::dynamic_pointer_cast(ree_array->Slice(500, 0)); ASSERT_EQ(zero_length_at_end->FindPhysicalOffset(), 5); ASSERT_EQ(zero_length_at_end->FindPhysicalLength(), 0); + ASSERT_EQ(ree_util::FindPhysicalRange(*zero_length_at_end->data(), + zero_length_at_end->offset(), + zero_length_at_end->length()), + Range(5, 0)); } TEST_P(TestRunEndEncodedArray, LogicalRunEnds) { diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 602a468fafb05..0b82a82fbdb26 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -59,6 +59,7 @@ #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "arrow/util/range.h" #include "arrow/visit_data_inline.h" @@ -366,13 +367,12 @@ TEST_F(TestArray, BuildLargeInMemoryArray) { ASSERT_EQ(length, result->length()); } -TEST_F(TestArray, TestMakeArrayOfNull) { +static std::vector> TestArrayUtilitiesAgainstTheseTypes() { FieldVector union_fields1({field("a", utf8()), field("b", int32())}); FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))}); std::vector union_type_codes{7, 42}; - std::shared_ptr types[] = { - // clang-format off + return { null(), boolean(), int8(), @@ -387,7 +387,7 @@ TEST_F(TestArray, TestMakeArrayOfNull) { utf8(), large_utf8(), list(utf8()), - list(int64()), // ARROW-9071 + list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull large_list(large_utf8()), fixed_size_list(utf8(), 3), fixed_size_list(int64(), 4), @@ -397,13 +397,15 @@ TEST_F(TestArray, TestMakeArrayOfNull) { sparse_union(union_fields2, union_type_codes), dense_union(union_fields1, union_type_codes), dense_union(union_fields2, union_type_codes), - smallint(), // extension type - list_extension_type(), // nested extension type - // clang-format on + smallint(), // extension type + list_extension_type(), // nested extension type + run_end_encoded(int16(), utf8()), }; +} +TEST_F(TestArray, TestMakeArrayOfNull) { for (int64_t length : {0, 1, 16, 133}) { - for (auto type : types) { + for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { ARROW_SCOPED_TRACE("type = ", type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(type, length)); ASSERT_EQ(array->type(), type); @@ -716,36 +718,7 @@ void CheckSpanRoundTrip(const Array& array) { } TEST_F(TestArray, TestMakeEmptyArray) { - FieldVector union_fields1({field("a", utf8()), field("b", int32())}); - FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))}); - std::vector union_type_codes{7, 42}; - - 
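
A worked example of the physical-range arithmetic the new assertions cover: run ends [100, 200, 300, 400, 500] encode 500 logical values in five physical runs, so the logical window [199, 300) touches the run ending at 200 and the run ending at 300.

// Same arithmetic as the slice = ree_array->Slice(199, 101) case above.
auto [phys_offset, phys_length] =
    ree_util::FindPhysicalRange(*ree_array->data(), /*offset=*/199, /*length=*/101);
// phys_offset == 1 (run ending at 200), phys_length == 2 (runs 1 and 2)
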
std::shared_ptr types[] = {null(), - boolean(), - int8(), - uint16(), - int32(), - uint64(), - float64(), - binary(), - large_binary(), - fixed_size_binary(3), - decimal(16, 4), - utf8(), - large_utf8(), - list(utf8()), - list(int64()), - large_list(large_utf8()), - fixed_size_list(utf8(), 3), - fixed_size_list(int64(), 4), - dictionary(int32(), utf8()), - struct_({field("a", utf8()), field("b", int32())}), - sparse_union(union_fields1, union_type_codes), - sparse_union(union_fields2, union_type_codes), - dense_union(union_fields1, union_type_codes), - dense_union(union_fields2, union_type_codes)}; - - for (auto type : types) { + for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { ARROW_SCOPED_TRACE("type = ", type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type)); ASSERT_OK(array->ValidateFull()); @@ -754,6 +727,29 @@ TEST_F(TestArray, TestMakeEmptyArray) { } } +TEST_F(TestArray, TestFillFromScalar) { + for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + for (auto seed : {0u, 0xdeadbeef, 42u}) { + ARROW_SCOPED_TRACE("seed = ", seed); + + Field field("", type, /*nullable=*/true, + key_value_metadata({{"extension_allow_random_storage", "true"}})); + auto array = random::GenerateArray(field, 1, seed); + + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0)); + + ArraySpan span(*scalar); + auto roundtripped_array = span.ToArray(); + AssertArraysEqual(*array, *roundtripped_array); + + ASSERT_OK(roundtripped_array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); + AssertScalarsEqual(*scalar, *roundtripped_scalar); + } + } +} + TEST_F(TestArray, ExtensionSpanRoundTrip) { // Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't // work for extension types so we check that here @@ -1731,21 +1727,6 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesStdBool) { this->Check(this->builder_nn_, false); } -TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { - ARROW_SUPPRESS_DEPRECATION_WARNING - int64_t n = 1000; - ASSERT_OK(this->builder_->Reserve(n)); - - ASSERT_OK(this->builder_->Advance(100)); - ASSERT_EQ(100, this->builder_->length()); - - ASSERT_OK(this->builder_->Advance(900)); - - int64_t too_many = this->builder_->capacity() - 1000 + 1; - ASSERT_RAISES(Invalid, this->builder_->Advance(too_many)); - ARROW_UNSUPPRESS_DEPRECATION_WARNING -} - TYPED_TEST(TestPrimitiveBuilder, TestResize) { int64_t cap = kMinBuilderCapacity * 2; @@ -1761,9 +1742,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_OK(this->builder_->Reserve(100)); ASSERT_EQ(0, this->builder_->length()); ASSERT_GE(100, this->builder_->capacity()); - ARROW_SUPPRESS_DEPRECATION_WARNING - ASSERT_OK(this->builder_->Advance(100)); - ARROW_UNSUPPRESS_DEPRECATION_WARNING + ASSERT_OK(this->builder_->AppendEmptyValues(100)); ASSERT_EQ(100, this->builder_->length()); ASSERT_GE(100, this->builder_->capacity()); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 382c35789c4e0..0cea571be3e32 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -142,7 +142,6 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool()) : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {} - using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; /// Scalar append @@ -182,7 +181,6 @@ class ARROW_EXPORT 
AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase int64_t alignment = kDefaultBufferAlignment) : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {} - using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; /// Scalar append diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 70da1fbb2966a..3000aea3e189a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -91,14 +91,6 @@ Status ArrayBuilder::Resize(int64_t capacity) { return null_bitmap_builder_.Resize(capacity); } -Status ArrayBuilder::Advance(int64_t elements) { - if (length_ + elements > capacity_) { - return Status::Invalid("Builder must be expanded"); - } - length_ += elements; - return null_bitmap_builder_.Advance(elements); -} - namespace { template diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index abbd61be80359..05af850fd149c 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -180,15 +180,6 @@ class ARROW_EXPORT ArrayBuilder { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } - /// For cases where raw data was memcpy'd into the internal buffers, allows us - /// to advance the length of the builder. It is your responsibility to use - /// this function responsibly. - ARROW_DEPRECATED( - "Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly " - "untested.\nFor low-level control over buffer construction, use BufferBuilder " - "or TypedBufferBuilder directly.") - Status Advance(int64_t elements); - /// \brief Return result of builder as an internal generic ArrayData /// object. Resets builder except for dictionary builder /// diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 8764e9c354c55..79595ab7c7c31 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -130,7 +130,8 @@ std::shared_ptr ArrayData::Make(std::shared_ptr type, int64 } std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { - ARROW_CHECK_LE(off, length) << "Slice offset greater than array length"; + ARROW_CHECK_LE(off, length) << "Slice offset (" << off + << ") greater than array length (" << length << ")"; len = std::min(length - off, len); off += offset; @@ -228,12 +229,11 @@ void ArraySpan::SetMembers(const ArrayData& data) { namespace { template -void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_size, - int buffer_index = 1) { - buffer[0] = 0; - buffer[1] = static_cast(value_size); - span->buffers[buffer_index].data = reinterpret_cast(buffer); - span->buffers[buffer_index].size = 2 * sizeof(offset_type); +BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { + auto* offsets = reinterpret_cast(scratch_space); + offsets[0] = 0; + offsets[1] = static_cast(value_size); + return {scratch_space, sizeof(offset_type) * 2}; } int GetNumBuffers(const DataType& type) { @@ -241,9 +241,8 @@ int GetNumBuffers(const DataType& type) { case Type::NA: case Type::STRUCT: case Type::FIXED_SIZE_LIST: - return 1; case Type::RUN_END_ENCODED: - return 0; + return 1; case Type::BINARY: case Type::LARGE_BINARY: case Type::STRING: @@ -265,16 +264,19 @@ int GetNumBuffers(const DataType& type) { namespace internal { void FillZeroLengthArray(const DataType* type, ArraySpan* span) { - memset(span->scratch_space, 0x00, sizeof(span->scratch_space)); - span->type = type; span->length = 0; int num_buffers = GetNumBuffers(*type); for (int i = 0; i 
< num_buffers; ++i) { - span->buffers[i].data = reinterpret_cast(span->scratch_space); + alignas(int64_t) static std::array kZeros{0}; + span->buffers[i].data = kZeros.data(); span->buffers[i].size = 0; } + if (!HasValidityBitmap(type->id())) { + span->buffers[0] = {}; + } + for (int i = num_buffers; i < 3; ++i) { span->buffers[i] = {}; } @@ -304,9 +306,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) { Type::type type_id = value.type->id(); - // Populate null count and validity bitmap (only for non-union/null types) - this->null_count = value.is_valid ? 0 : 1; - if (!is_union(type_id) && type_id != Type::NA) { + if (type_id == Type::NA) { + this->null_count = 1; + } else if (!internal::HasValidityBitmap(type_id)) { + this->null_count = 0; + } else { + // Populate null count and validity bitmap + this->null_count = value.is_valid ? 0 : 1; this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit; this->buffers[0].size = 1; } @@ -329,7 +335,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } } else if (is_base_binary_like(type_id)) { const auto& scalar = checked_cast(value); - this->buffers[1].data = reinterpret_cast(this->scratch_space); + const uint8_t* data_buffer = nullptr; int64_t data_size = 0; if (scalar.is_valid) { @@ -337,12 +343,11 @@ void ArraySpan::FillFromScalar(const Scalar& value) { data_size = scalar.value->size(); } if (is_binary_like(type_id)) { - SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), - data_size); + this->buffers[1] = + OffsetsForScalar(scalar.scratch_space_, static_cast(data_size)); } else { // is_large_binary_like - SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), - data_size); + this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, data_size); } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; @@ -367,11 +372,10 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } if (type_id == Type::LIST || type_id == Type::MAP) { - SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), - value_length); + this->buffers[1] = + OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); } else if (type_id == Type::LARGE_LIST) { - SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), - value_length); + this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); } else { // FIXED_SIZE_LIST: does not have a second buffer this->buffers[1] = {}; @@ -384,26 +388,31 @@ void ArraySpan::FillFromScalar(const Scalar& value) { this->child_data[i].FillFromScalar(*scalar.value[i]); } } else if (is_union(type_id)) { + // Dense union needs scratch space to store both offsets and a type code + struct UnionScratchSpace { + alignas(int64_t) int8_t type_code; + alignas(int64_t) uint8_t offsets[sizeof(int32_t) * 2]; + }; + static_assert(sizeof(UnionScratchSpace) <= sizeof(UnionScalar::scratch_space_)); + auto* union_scratch_space = reinterpret_cast( + &checked_cast(value).scratch_space_); + // First buffer is kept null since unions have no validity vector this->buffers[0] = {}; - this->buffers[1].data = reinterpret_cast(this->scratch_space); + union_scratch_space->type_code = checked_cast(value).type_code; + this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); this->buffers[1].size = 1; - int8_t* type_codes = reinterpret_cast(this->scratch_space); - type_codes[0] = checked_cast(value).type_code; this->child_data.resize(this->type->num_fields()); if (type_id == Type::DENSE_UNION) { const auto& scalar = checked_cast(value); - // Has 
offset; start 4 bytes in so it's aligned to a 32-bit boundaries - SetOffsetsForScalar(this, - reinterpret_cast(this->scratch_space) + 1, 1, - /*buffer_index=*/2); + this->buffers[2] = + OffsetsForScalar(union_scratch_space->offsets, static_cast(1)); // We can't "see" the other arrays in the union, but we put the "active" // union array in the right place and fill zero-length arrays for the // others - const std::vector& child_ids = - checked_cast(this->type)->child_ids(); + const auto& child_ids = checked_cast(this->type)->child_ids(); DCHECK_GE(scalar.type_code, 0); DCHECK_LT(scalar.type_code, static_cast(child_ids.size())); for (int i = 0; i < static_cast(this->child_data.size()); ++i) { @@ -429,6 +438,32 @@ void ArraySpan::FillFromScalar(const Scalar& value) { // Restore the extension type this->type = value.type.get(); + } else if (type_id == Type::RUN_END_ENCODED) { + const auto& scalar = checked_cast(value); + this->child_data.resize(2); + + auto set_run_end = [&](auto run_end) { + auto& e = this->child_data[0]; + e.type = scalar.run_end_type().get(); + e.length = 1; + e.null_count = 0; + e.buffers[1].data = scalar.scratch_space_; + e.buffers[1].size = sizeof(run_end); + reinterpret_cast(scalar.scratch_space_)[0] = run_end; + }; + + switch (scalar.run_end_type()->id()) { + case Type::INT16: + set_run_end(static_cast(1)); + break; + case Type::INT32: + set_run_end(static_cast(1)); + break; + default: + DCHECK_EQ(scalar.run_end_type()->id(), Type::INT64); + set_run_end(static_cast(1)); + } + this->child_data[1].FillFromScalar(*scalar.value); } else { DCHECK_EQ(Type::NA, type_id) << "should be unreachable: " << *value.type; } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 82a6e73372798..8c6b250b71adf 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -360,6 +360,15 @@ struct ARROW_EXPORT BufferSpan { int64_t size = 0; // Pointer back to buffer that owns this memory const std::shared_ptr* owner = NULLPTR; + + template + const T* data_as() const { + return reinterpret_cast(data); + } + template + T* mutable_data_as() { + return reinterpret_cast(data); + } }; /// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply @@ -372,11 +381,6 @@ struct ARROW_EXPORT ArraySpan { int64_t offset = 0; BufferSpan buffers[3]; - // 16 bytes of scratch space to enable this ArraySpan to be a view onto - // scalar values including binary scalars (where we need to create a buffer - // that looks like two 32-bit or 64-bit offsets) - uint64_t scratch_space[2]; - ArraySpan() = default; explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {} diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index d7a8783d4427b..e84ab404ad6e3 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -554,13 +554,18 @@ class NullArrayFactory { } Status Visit(const RunEndEncodedType& type) { - ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayOfNull(type.value_type(), 1, pool_)); - ARROW_ASSIGN_OR_RAISE(auto run_end_scalar, - MakeScalarForRunEndValue(*type.run_end_type(), length_)); - ARROW_ASSIGN_OR_RAISE(auto run_ends, MakeArrayFromScalar(*run_end_scalar, 1, pool_)); - ARROW_ASSIGN_OR_RAISE(auto ree_array, - RunEndEncodedArray::Make(length_, run_ends, values)); - out_ = ree_array->data(); + std::shared_ptr run_ends, values; + if (length_ == 0) { + ARROW_ASSIGN_OR_RAISE(run_ends, MakeEmptyArray(type.run_end_type(), pool_)); + ARROW_ASSIGN_OR_RAISE(values, MakeEmptyArray(type.value_type(), pool_)); 
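
Moving the scratch space from ArraySpan into the scalars means a span is once again a plain non-owning view, and the new BufferSpan::data_as<T>() and mutable_data_as<T>() helpers replace reinterpret_cast boilerplate at call sites. For example:

// Typed access to a span's offsets buffer, replacing a reinterpret_cast.
void ReadListOffsets(const arrow::ArraySpan& span) {
  const int32_t* offsets = span.buffers[1].data_as<int32_t>();
  // offsets[span.offset] .. offsets[span.offset + span.length] delimit the lists
  (void)offsets;
}
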
+ } else { + ARROW_ASSIGN_OR_RAISE(auto run_end_scalar, + MakeScalarForRunEndValue(*type.run_end_type(), length_)); + ARROW_ASSIGN_OR_RAISE(run_ends, MakeArrayFromScalar(*run_end_scalar, 1, pool_)); + ARROW_ASSIGN_OR_RAISE(values, MakeArrayOfNull(type.value_type(), 1, pool_)); + } + out_->child_data[0] = run_ends->data(); + out_->child_data[1] = values->data(); return Status::OK(); } @@ -582,7 +587,7 @@ class NullArrayFactory { } MemoryPool* pool_; - std::shared_ptr type_; + const std::shared_ptr& type_; int64_t length_; std::shared_ptr out_; std::shared_ptr buffer_; @@ -859,6 +864,13 @@ Result> MakeArrayFromScalar(const Scalar& scalar, int64_t Result> MakeEmptyArray(std::shared_ptr type, MemoryPool* memory_pool) { + if (type->id() == Type::EXTENSION) { + const auto& ext_type = checked_cast(*type); + ARROW_ASSIGN_OR_RAISE(auto storage, + MakeEmptyArray(ext_type.storage_type(), memory_pool)); + storage->data()->type = std::move(type); + return ext_type.MakeArray(storage->data()); + } std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder)); RETURN_NOT_OK(builder->Resize(0)); diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index afe3d773594f0..99dc29cfe5296 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -21,6 +21,7 @@ #include #include +#include "arrow/memory_pool_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" @@ -43,6 +44,8 @@ Result> Buffer::CopySlice(const int64_t start, return std::move(new_buffer); } +Buffer::Buffer() : Buffer(memory_pool::internal::kZeroSizeArea, 0) {} + namespace { Status CheckBufferSlice(const Buffer& buffer, int64_t offset, int64_t length) { @@ -147,11 +150,12 @@ Result> Buffer::ViewOrCopy( class StlStringBuffer : public Buffer { public: - explicit StlStringBuffer(std::string data) - : Buffer(nullptr, 0), input_(std::move(data)) { - data_ = reinterpret_cast(input_.c_str()); - size_ = static_cast(input_.size()); - capacity_ = size_; + explicit StlStringBuffer(std::string data) : input_(std::move(data)) { + if (!input_.empty()) { + data_ = reinterpret_cast(input_.c_str()); + size_ = static_cast(input_.size()); + capacity_ = size_; + } } private: @@ -209,8 +213,11 @@ Result> ConcatenateBuffers( ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length, pool)); auto out_data = out->mutable_data(); for (const auto& buffer : buffers) { - std::memcpy(out_data, buffer->data(), buffer->size()); - out_data += buffer->size(); + // Passing nullptr to std::memcpy is undefined behavior, so skip empty buffers + if (buffer->size() != 0) { + std::memcpy(out_data, buffer->data(), buffer->size()); + out_data += buffer->size(); + } } return std::move(out); } diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 65f1abda16106..08a3bd749e25d 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include "arrow/device.h" #include "arrow/status.h" #include "arrow/type_fwd.h" -#include "arrow/util/bytes_view.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -50,6 +50,8 @@ namespace arrow { /// The following invariant is always true: Size <= Capacity class ARROW_EXPORT Buffer { public: + ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); + /// \brief Construct from buffer and size without copying memory /// /// \param[in] data a memory buffer @@ -57,18 +59,31 @@ class ARROW_EXPORT Buffer { /// /// \note The passed memory must be kept alive through 
some other means Buffer(const uint8_t* data, int64_t size) - : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) { + : is_mutable_(false), + is_cpu_(true), + data_(data), + size_(size), + capacity_(size), + device_type_(DeviceAllocationType::kCPU) { SetMemoryManager(default_cpu_memory_manager()); } Buffer(const uint8_t* data, int64_t size, std::shared_ptr mm, - std::shared_ptr parent = NULLPTR) + std::shared_ptr parent = NULLPTR, + std::optional device_type = std::nullopt) : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(std::move(parent)) { + // SetMemoryManager will also set device_type_ SetMemoryManager(std::move(mm)); + // if a device type is specified, use that instead. for example: + // CUDA_HOST. The CudaMemoryManager will set device_type_ to CUDA, + // but you can specify CUDA_HOST as the device type to override it. + if (device_type != std::nullopt) { + device_type_ = device_type; + } } Buffer(uintptr_t address, int64_t size, std::shared_ptr mm, @@ -137,6 +152,32 @@ class ARROW_EXPORT Buffer { /// \return a new Buffer instance static std::shared_ptr FromString(std::string data); + /// \brief Construct an immutable buffer that takes ownership of the contents + /// of an std::vector (without copying it). Only vectors of TrivialType objects + /// (integers, floating point numbers, ...) can be wrapped by this function. + /// + /// \param[in] vec a vector to own + /// \return a new Buffer instance + template + static std::shared_ptr FromVector(std::vector vec) { + static_assert(std::is_trivial_v, + "Buffer::FromVector can only wrap vectors of trivial objects"); + + if (vec.empty()) { + return std::shared_ptr{new Buffer()}; + } + + auto* data = reinterpret_cast(vec.data()); + auto size_in_bytes = static_cast(vec.size() * sizeof(T)); + return std::shared_ptr{ + new Buffer{data, size_in_bytes}, + // Keep the vector's buffer alive inside the shared_ptr's destructor until after + // we have deleted the Buffer. Note we can't use this trick in FromString since + // std::string's data is inline for short strings so moving invalidates pointers + // into the string's buffer. + [vec = std::move(vec)](Buffer* buffer) { delete buffer; }}; + } + /// \brief Create buffer referencing typed memory with some length without /// copying /// \param[in] data the typed memory as C array @@ -167,13 +208,9 @@ class ARROW_EXPORT Buffer { /// \brief View buffer contents as a std::string_view /// \return std::string_view explicit operator std::string_view() const { - return std::string_view(reinterpret_cast(data_), size_); + return {reinterpret_cast(data_), static_cast(size_)}; } - /// \brief View buffer contents as a util::bytes_view - /// \return util::bytes_view - explicit operator util::bytes_view() const { return util::bytes_view(data_, size_); } - /// \brief Return a pointer to the buffer's data /// /// The buffer has to be a CPU buffer (`is_cpu()` is true). @@ -187,6 +224,15 @@ class ARROW_EXPORT Buffer { return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR; } + /// \brief Return a pointer to the buffer's data cast to a specific type + /// + /// The buffer has to be a CPU buffer (`is_cpu()` is true). + /// Otherwise, an assertion may be thrown or a null pointer may be returned. 
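
Buffer::FromVector adopts the vector's heap allocation without copying; the capturing deleter keeps the vector alive until the last reference to the Buffer is gone. A usage sketch:

std::vector<int32_t> values = {1, 2, 3, 4};
auto buf = arrow::Buffer::FromVector(std::move(values));
// buf->size() == 16 bytes; the payload can be read back with the typed
// accessor introduced just below:
const int32_t* data = buf->data_as<int32_t>();
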
+ template + const T* data_as() const { + return reinterpret_cast(data()); + } + /// \brief Return a writable pointer to the buffer's data /// /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` @@ -204,6 +250,16 @@ class ARROW_EXPORT Buffer { : NULLPTR; } + /// \brief Return a writable pointer to the buffer's data cast to a specific type + /// + /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` + /// are true). Otherwise, an assertion may be thrown or a null pointer may + /// be returned. + template + T* mutable_data_as() { + return reinterpret_cast(mutable_data()); + } + /// \brief Return the device address of the buffer's data uintptr_t address() const { return reinterpret_cast(data_); } @@ -240,6 +296,8 @@ class ARROW_EXPORT Buffer { const std::shared_ptr& memory_manager() const { return memory_manager_; } + std::optional device_type() const { return device_type_; } + std::shared_ptr parent() const { return parent_; } /// \brief Get a RandomAccessFile for reading a buffer @@ -294,6 +352,7 @@ class ARROW_EXPORT Buffer { const uint8_t* data_; int64_t size_; int64_t capacity_; + std::optional device_type_; // null by default, but may be set std::shared_ptr parent_; @@ -303,17 +362,16 @@ class ARROW_EXPORT Buffer { std::shared_ptr memory_manager_; protected: + Buffer(); + void CheckMutable() const; void CheckCPU() const; void SetMemoryManager(std::shared_ptr mm) { memory_manager_ = std::move(mm); is_cpu_ = memory_manager_->is_cpu(); + device_type_ = memory_manager_->device()->device_type(); } - - private: - Buffer() = delete; - ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; /// \defgroup buffer-slicing-functions Functions for slicing buffers diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 5f37e552004a1..e7eea64043ba8 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -118,6 +118,11 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. + Status Append(std::string_view v) { return Append(v.data(), v.size()); } + /// \brief Append copies of a value to the buffer /// /// The buffer is automatically expanded if necessary. 
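
The new std::string_view overload on BufferBuilder makes text appends one call instead of a (pointer, length) pair. A minimal sketch:

arrow::Result<std::shared_ptr<arrow::Buffer>> BuildGreeting() {
  arrow::BufferBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append(std::string_view("hello ")));
  ARROW_RETURN_NOT_OK(builder.Append(std::string_view("world")));
  std::shared_ptr<arrow::Buffer> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));
  return out;  // holds "hello world"
}
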
@@ -138,6 +143,7 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } + void UnsafeAppend(std::string_view v) { UnsafeAppend(v.data(), v.size()); } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast(num_copies)); @@ -196,6 +202,14 @@ class ARROW_EXPORT BufferBuilder { int64_t length() const { return size_; } const uint8_t* data() const { return data_; } uint8_t* mutable_data() { return data_; } + template + const T* data_as() const { + return reinterpret_cast(data_); + } + template + T* mutable_data_as() { + return reinterpret_cast(data_); + } private: std::shared_ptr buffer_; diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc index ce8bab846d586..13f6ea63b5e62 100644 --- a/cpp/src/arrow/buffer_test.cc +++ b/cpp/src/arrow/buffer_test.cc @@ -41,6 +41,7 @@ using internal::checked_cast; using internal::checked_pointer_cast; static const char kMyDeviceTypeName[] = "arrowtest::MyDevice"; +static const DeviceAllocationType kMyDeviceType = DeviceAllocationType::kEXT_DEV; static const int kMyDeviceAllowCopy = 1; static const int kMyDeviceAllowView = 2; @@ -70,6 +71,8 @@ class MyDevice : public Device { return checked_cast(other).value_ == value_; } + DeviceAllocationType device_type() const override { return kMyDeviceType; } + std::shared_ptr default_memory_manager() override; int value() const { return value_; } @@ -256,6 +259,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -263,6 +267,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -271,6 +276,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -280,6 +286,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -290,6 +297,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -297,6 +305,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -305,6 +314,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), 
nullptr); #endif @@ -315,6 +325,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -330,6 +341,7 @@ TEST_F(TestDevice, View) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_EQ(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -338,6 +350,7 @@ TEST_F(TestDevice, View) { ASSERT_EQ(buffer->device(), my_view_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_EQ(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -348,6 +361,7 @@ TEST_F(TestDevice, View) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_EQ(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -1000,4 +1014,13 @@ TYPED_TEST(TypedTestBuffer, ResizeOOM) { #endif } +TEST(TestBufferConcatenation, EmptyBuffer) { + // GH-36913: UB shouldn't be triggered by copying from a null pointer + const std::string contents = "hello, world"; + auto buffer = std::make_shared(contents); + auto empty_buffer = std::make_shared(/*data=*/nullptr, /*size=*/0); + ASSERT_OK_AND_ASSIGN(auto result, ConcatenateBuffers({buffer, empty_buffer})); + AssertMyBufferEqual(*result, contents); +} + } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 85a5156d11db2..13355dd6d05ae 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -522,6 +522,8 @@ struct ExportedArrayPrivateData : PoolAllocationMixin std::shared_ptr data_; + RawSyncEvent sync_event_; + ExportedArrayPrivateData() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData); ARROW_DISALLOW_COPY_AND_ASSIGN(ExportedArrayPrivateData); @@ -544,7 +546,12 @@ void ReleaseExportedArray(struct ArrowArray* array) { << "Dictionary release callback should have marked it released"; } DCHECK_NE(array->private_data, nullptr); - delete reinterpret_cast(array->private_data); + auto* pdata = reinterpret_cast(array->private_data); + if (pdata->sync_event_.sync_event != nullptr && + pdata->sync_event_.release_func != nullptr) { + pdata->sync_event_.release_func(pdata->sync_event_.sync_event); + } + delete pdata; ArrowArrayMarkReleased(array); } @@ -584,6 +591,7 @@ struct ArrayExporter { // Store owning pointer to ArrayData export_.data_ = data; + export_.sync_event_ = RawSyncEvent(); return Status::OK(); } @@ -663,6 +671,118 @@ Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, return Status::OK(); } +////////////////////////////////////////////////////////////////////////// +// C device arrays + +Status ValidateDeviceInfo(const ArrayData& data, + std::optional* device_type, + int64_t* device_id) { + for (const auto& buf : data.buffers) { + if (!buf) { + continue; + } + + if (*device_type == std::nullopt) { + *device_type = buf->device_type(); + *device_id = buf->device()->device_id(); + continue; + } + + if (buf->device_type() != *device_type) { + return Status::Invalid( + "Exporting device array with buffers on more than one device."); + } + + if (buf->device()->device_id() != 
*device_id) {
+      return Status::Invalid(
+          "Exporting device array with buffers on multiple device ids.");
+    }
+  }
+
+  for (const auto& child : data.child_data) {
+    RETURN_NOT_OK(ValidateDeviceInfo(*child, device_type, device_id));
+  }
+
+  return Status::OK();
+}
+
+Result<std::pair<std::optional<DeviceAllocationType>, int64_t>> ValidateDeviceInfo(
+    const ArrayData& data) {
+  std::optional<DeviceAllocationType> device_type;
+  int64_t device_id = -1;
+  RETURN_NOT_OK(ValidateDeviceInfo(data, &device_type, &device_id));
+  return std::make_pair(device_type, device_id);
+}
+
+Status ExportDeviceArray(const Array& array, RawSyncEvent sync_event,
+                         struct ArrowDeviceArray* out, struct ArrowSchema* out_schema) {
+  if (sync_event.sync_event != nullptr && sync_event.release_func == nullptr) {
+    return Status::Invalid(
+        "Must provide a release event function if providing a non-null event");
+  }
+
+  SchemaExportGuard guard(out_schema);
+  if (out_schema != nullptr) {
+    RETURN_NOT_OK(ExportType(*array.type(), out_schema));
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto device_info, ValidateDeviceInfo(*array.data()));
+  if (!device_info.first) {
+    out->device_type = ARROW_DEVICE_CPU;
+  } else {
+    out->device_type = static_cast<ArrowDeviceType>(*device_info.first);
+  }
+  out->device_id = device_info.second;
+
+  ArrayExporter exporter;
+  RETURN_NOT_OK(exporter.Export(array.data()));
+  exporter.Finish(&out->array);
+
+  auto* pdata = reinterpret_cast<ExportedArrayPrivateData*>(out->array.private_data);
+  pdata->sync_event_ = sync_event;
+  out->sync_event = sync_event.sync_event;
+
+  guard.Detach();
+  return Status::OK();
+}
+
+Status ExportDeviceRecordBatch(const RecordBatch& batch, RawSyncEvent sync_event,
+                               struct ArrowDeviceArray* out,
+                               struct ArrowSchema* out_schema) {
+  if (sync_event.sync_event != nullptr && sync_event.release_func == nullptr) {
+    return Status::Invalid(
+        "Must provide a release event function if providing a non-null event");
+  }
+
+  // XXX perhaps bypass ToStructArray for speed?
+  ARROW_ASSIGN_OR_RAISE(auto array, batch.ToStructArray());
+
+  SchemaExportGuard guard(out_schema);
+  if (out_schema != nullptr) {
+    // Export the schema, not the struct type, so as not to lose top-level metadata
+    RETURN_NOT_OK(ExportSchema(*batch.schema(), out_schema));
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto device_info, ValidateDeviceInfo(*array->data()));
+  if (!device_info.first) {
+    out->device_type = ARROW_DEVICE_CPU;
+  } else {
+    out->device_type = static_cast<ArrowDeviceType>(*device_info.first);
+  }
+  out->device_id = device_info.second;
+
+  ArrayExporter exporter;
+  RETURN_NOT_OK(exporter.Export(array->data()));
+  exporter.Finish(&out->array);
+
+  auto* pdata = reinterpret_cast<ExportedArrayPrivateData*>(out->array.private_data);
+  pdata->sync_event_ = sync_event;
+  out->sync_event = sync_event.sync_event;
+
+  guard.Detach();
+  return Status::OK();
+}
+
 //////////////////////////////////////////////////////////////////////////
 // C schema import
 
@@ -1242,6 +1362,7 @@ namespace {
 // The ArrowArray is released on destruction.
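
Usage sketch for the export side: a CPU-resident array needs no synchronization, so both fields of the RawSyncEvent are left null, which the contract allows.

arrow::Status ExportToConsumer(const std::shared_ptr<arrow::Array>& array,
                               struct ArrowDeviceArray* c_array,
                               struct ArrowSchema* c_schema) {
  arrow::RawSyncEvent no_sync;  // sync_event == nullptr, release_func empty
  return arrow::ExportDeviceArray(*array, no_sync, c_array, c_schema);
}
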
+
 //////////////////////////////////////////////////////////////////////////
 // C schema import
 
@@ -1242,6 +1362,7 @@ namespace {
 // The ArrowArray is released on destruction.
 struct ImportedArrayData {
   struct ArrowArray array_;
+  void* sync_event_;
 
   ImportedArrayData() {
     ArrowArrayMarkReleased(&array_);  // Initially released
@@ -1267,6 +1388,11 @@ class ImportedBuffer : public Buffer {
                  std::shared_ptr<ImportedArrayData> import)
       : Buffer(data, size), import_(std::move(import)) {}
 
+  ImportedBuffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
+                 DeviceAllocationType device_type,
+                 std::shared_ptr<ImportedArrayData> import)
+      : Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {}
+
   ~ImportedBuffer() override {}
 
  protected:
@@ -1275,7 +1401,20 @@ struct ArrayImporter {
   explicit ArrayImporter(const std::shared_ptr<DataType>& type)
-      : type_(type), zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)) {}
+      : type_(type),
+        zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)),
+        device_type_(DeviceAllocationType::kCPU) {}
+
+  Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) {
+    ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id));
+    device_type_ = static_cast<DeviceAllocationType>(src->device_type);
+    RETURN_NOT_OK(Import(&src->array));
+    import_->sync_event_ = src->sync_event;
+    // reset internal state before next import
+    memory_mgr_.reset();
+    device_type_ = DeviceAllocationType::kCPU;
+    return Status::OK();
+  }
 
   Status Import(struct ArrowArray* src) {
     if (ArrowArrayIsReleased(src)) {
@@ -1588,7 +1727,12 @@ struct ArrayImporter {
     std::shared_ptr<Buffer>* out = &data_->buffers[buffer_id];
     auto data = reinterpret_cast<const uint8_t*>(c_struct_->buffers[buffer_id]);
     if (data != nullptr) {
-      *out = std::make_shared<ImportedBuffer>(data, buffer_size, import_);
+      if (memory_mgr_) {
+        *out = std::make_shared<ImportedBuffer>(data, buffer_size, memory_mgr_,
+                                                device_type_, import_);
+      } else {
+        *out = std::make_shared<ImportedBuffer>(data, buffer_size, import_);
+      }
     } else if (is_null_bitmap) {
       out->reset();
     } else {
@@ -1613,6 +1757,9 @@ struct ArrayImporter {
 
   // For imported null buffer pointers
   std::shared_ptr<Buffer> zero_size_buffer_;
+
+  std::shared_ptr<MemoryManager> memory_mgr_;
+  DeviceAllocationType device_type_;
 };
 
 }  // namespace
@@ -1652,6 +1799,45 @@ Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
   return ImportRecordBatch(array, *maybe_schema);
 }
 
+Result<std::shared_ptr<Array>> ImportDeviceArray(struct ArrowDeviceArray* array,
+                                                 std::shared_ptr<DataType> type,
+                                                 const DeviceMemoryMapper& mapper) {
+  ArrayImporter importer(type);
+  RETURN_NOT_OK(importer.Import(array, mapper));
+  return importer.MakeArray();
+}
+
+Result<std::shared_ptr<Array>> ImportDeviceArray(struct ArrowDeviceArray* array,
+                                                 struct ArrowSchema* type,
+                                                 const DeviceMemoryMapper& mapper) {
+  auto maybe_type = ImportType(type);
+  if (!maybe_type.ok()) {
+    ArrowArrayRelease(&array->array);
+    return maybe_type.status();
+  }
+  return ImportDeviceArray(array, *maybe_type, mapper);
+}
+
+Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
+    struct ArrowDeviceArray* array, std::shared_ptr<Schema> schema,
+    const DeviceMemoryMapper& mapper) {
+  auto type = struct_(schema->fields());
+  ArrayImporter importer(type);
+  RETURN_NOT_OK(importer.Import(array, mapper));
+  return importer.MakeRecordBatch(std::move(schema));
+}
+
+Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
+    struct ArrowDeviceArray* array, struct ArrowSchema* schema,
+    const DeviceMemoryMapper& mapper) {
+  auto maybe_schema = ImportSchema(schema);
+  if (!maybe_schema.ok()) {
+    ArrowArrayRelease(&array->array);
+    return maybe_schema.status();
+  }
+  return ImportDeviceRecordBatch(array, *maybe_schema, mapper);
+}
+
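On the import side, the DeviceMemoryMapper is the consumer's extension point: it resolves the (device_type, device_id) pair announced in the ArrowDeviceArray to a MemoryManager, and every imported buffer is then tagged with that manager and device type instead of being assumed CPU-resident. A consumer-side sketch that only accepts CPU data; default_cpu_memory_manager() is Arrow's built-in CPU manager, the rest of the names are illustrative:

    #include "arrow/array.h"
    #include "arrow/c/bridge.h"
    #include "arrow/device.h"
    #include "arrow/result.h"

    arrow::Result<std::shared_ptr<arrow::Array>> ConsumeDeviceArray(
        struct ArrowDeviceArray* c_array, struct ArrowSchema* c_type) {
      arrow::DeviceMemoryMapper mapper =
          [](ArrowDeviceType device_type, int64_t device_id)
          -> arrow::Result<std::shared_ptr<arrow::MemoryManager>> {
        if (device_type == ARROW_DEVICE_CPU) {
          return arrow::default_cpu_memory_manager();
        }
        // A real consumer would dispatch to e.g. a CUDA memory manager here.
        return arrow::Status::NotImplemented("unsupported device type ", device_type);
      };
      return arrow::ImportDeviceArray(c_array, c_type, mapper);
    }
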
 
 //////////////////////////////////////////////////////////////////////////
 // C stream export
 
diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h
index 3b1a013d20dbf..92707a59729fc 100644
--- a/cpp/src/arrow/c/bridge.h
+++ b/cpp/src/arrow/c/bridge.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <functional>
 #include <memory>
 
@@ -166,6 +167,135 @@ Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
 
 /// @}
 
+/// \defgroup c-data-device-interface Functions for working with the C data device
+/// interface.
+///
+/// @{
+
+/// \brief EXPERIMENTAL: Type for freeing a sync event
+///
+/// If synchronization is necessary for accessing the data on a device,
+/// a pointer to an event needs to be passed when exporting the device
+/// array. It's the responsibility of the release function for the array
+/// to release the event. Both can be null if no synchronization is necessary.
+struct RawSyncEvent {
+  void* sync_event = NULL;
+  std::function<void(void*)> release_func;
+};
+
+/// \brief EXPERIMENTAL: Export C++ Array as an ArrowDeviceArray.
+///
+/// The resulting ArrowDeviceArray struct keeps the array data and buffers alive
+/// until its release callback is called by the consumer. All buffers in
+/// the provided array MUST have the same device_type, otherwise an error
+/// will be returned.
+///
+/// If a non-null sync_event is provided, then the sync_release func must also be
+/// non-null. If the sync_event is null, then the sync_release parameter is ignored.
+///
+/// \param[in] array Array object to export
+/// \param[in] sync_event A struct containing what is needed for syncing if necessary
+/// \param[out] out C struct to export the array to
+/// \param[out] out_schema optional C struct to export the array type to
+ARROW_EXPORT
+Status ExportDeviceArray(const Array& array, RawSyncEvent sync_event,
+                         struct ArrowDeviceArray* out,
+                         struct ArrowSchema* out_schema = NULLPTR);
+
+/// \brief EXPERIMENTAL: Export C++ RecordBatch as an ArrowDeviceArray.
+///
+/// The record batch is exported as if it were a struct array.
+/// The resulting ArrowDeviceArray struct keeps the record batch data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// All buffers of all columns in the record batch must have the same device_type,
+/// otherwise an error will be returned. If columns are on different devices,
+/// they should be exported using different ArrowDeviceArray instances.
+///
+/// If a non-null sync_event is provided, then the sync_release func must also be
+/// non-null. If the sync_event is null, then the sync_release parameter is ignored.
+///
+/// \param[in] batch Record batch to export
+/// \param[in] sync_event A struct containing what is needed for syncing if necessary
+/// \param[out] out C struct where to export the record batch
+/// \param[out] out_schema optional C struct where to export the record batch schema
+ARROW_EXPORT
+Status ExportDeviceRecordBatch(const RecordBatch& batch, RawSyncEvent sync_event,
+                               struct ArrowDeviceArray* out,
+                               struct ArrowSchema* out_schema = NULLPTR);
+
+using DeviceMemoryMapper =
+    std::function<Result<std::shared_ptr<MemoryManager>>(ArrowDeviceType, int64_t)>;
+
+/// \brief EXPERIMENTAL: Import C++ device array from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array. The
+/// buffers of the Array are located on the device indicated by the device_type.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in] type type of the imported array
+/// \param[in] mapper A function to map device + id to memory manager
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportDeviceArray(struct ArrowDeviceArray* array,
+                                                 std::shared_ptr<DataType> type,
+                                                 const DeviceMemoryMapper& mapper);
+
+/// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array.
+/// The ArrowSchema struct is released, even if this function fails. The
+/// buffers of the Array are located on the device indicated by the device_type.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in,out] type C data interface struct holding the array type
+/// \param[in] mapper A function to map device + id to memory manager
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportDeviceArray(struct ArrowDeviceArray* array,
+                                                 struct ArrowSchema* type,
+                                                 const DeviceMemoryMapper& mapper);
+
+/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data
+/// interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+/// The buffers of all columns of the record batch are located on the device
+/// indicated by the device type.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in] schema schema of the imported record batch
+/// \param[in] mapper A function to map device + id to memory manager
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
+    struct ArrowDeviceArray* array, std::shared_ptr<Schema> schema,
+    const DeviceMemoryMapper& mapper);
+
+/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema
+/// from the C data interface.
+///
+/// The type represented by the ArrowSchema struct must be a struct type array.
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+/// The ArrowSchema struct is released, even if this function fails. The buffers
+/// of all columns of the record batch are located on the device indicated by the
+/// device type.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in,out] schema C data interface struct holding the record batch schema
+/// \param[in] mapper A function to map device + id to memory manager
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
+    struct ArrowDeviceArray* array, struct ArrowSchema* schema,
+    const DeviceMemoryMapper& mapper);
+
+/// @}
+
 /// \defgroup c-stream-interface Functions for working with the C data interface.
/// /// @{ diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 5fe7b653c8970..5c7de8e4a0783 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -565,6 +565,15 @@ struct ArrayExportChecker { ASSERT_EQ(c_export->children, nullptr); } } + + void operator()(struct ArrowDeviceArray* c_export, const ArrayData& expected_data, + const ArrowDeviceType device_type, const int64_t device_id, + const void* sync_event) { + ASSERT_EQ(c_export->device_type, device_type); + ASSERT_EQ(c_export->device_id, device_id); + ASSERT_EQ(c_export->sync_event, sync_event); + this->operator()(&c_export->array, expected_data); + } }; struct RecordBatchExportChecker { @@ -592,6 +601,15 @@ struct RecordBatchExportChecker { ASSERT_EQ(c_export->children, nullptr); } } + + void operator()(struct ArrowDeviceArray* c_export, const RecordBatch& expected_data, + const ArrowDeviceType device_type, const int64_t device_id, + const void* sync_event) { + ASSERT_EQ(c_export->device_type, device_type); + ASSERT_EQ(c_export->device_id, device_id); + ASSERT_EQ(c_export->sync_event, sync_event); + this->operator()(&c_export->array, expected_data); + } }; class TestArrayExport : public ::testing::Test { @@ -1112,6 +1130,392 @@ TEST_F(TestArrayExport, ExportRecordBatch) { } } +//////////////////////////////////////////////////////////////////////////// +// Device Array Export Tests + +static const char kMyDeviceTypeName[] = "arrowtest::MyDevice"; +static const ArrowDeviceType kMyDeviceType = ARROW_DEVICE_EXT_DEV; + +class MyBuffer final : public MutableBuffer { + public: + using MutableBuffer::MutableBuffer; + + ~MyBuffer() { default_memory_pool()->Free(const_cast(data_), size_); } +}; + +class MyMemoryManager : public CPUMemoryManager { + public: + explicit MyMemoryManager(const std::shared_ptr& device) + : CPUMemoryManager(device, default_memory_pool()) {} + + Result> AllocateBuffer(int64_t size) override { + uint8_t* data; + RETURN_NOT_OK(pool_->Allocate(size, &data)); + return std::make_unique(data, size, shared_from_this()); + } + + protected: + Result> CopyBufferFrom( + const std::shared_ptr& buf, + const std::shared_ptr& from) override { + return CopyNonOwnedFrom(*buf, from); + } + Result> CopyNonOwnedFrom( + const Buffer& buf, const std::shared_ptr& from) override { + if (!from->is_cpu()) { + return nullptr; + } + + ARROW_ASSIGN_OR_RAISE(auto dest, AllocateBuffer(buf.size())); + if (buf.size() > 0) { + memcpy(dest->mutable_data(), buf.data(), static_cast(buf.size())); + } + return std::move(dest); + } +}; + +class MyDevice : public Device { + public: + explicit MyDevice(int value) : Device(true), value_(value) {} + const char* type_name() const override { return kMyDeviceTypeName; } + std::string ToString() const override { return kMyDeviceTypeName; } + bool Equals(const Device& other) const override { + if (other.type_name() != kMyDeviceTypeName || other.device_type() != device_type()) { + return false; + } + return checked_cast(other).value_ == value_; + } + DeviceAllocationType device_type() const override { + return static_cast(kMyDeviceType); + } + int64_t device_id() const override { return value_; } + std::shared_ptr default_memory_manager() override { + return std::make_shared(shared_from_this()); + } + + protected: + int value_; +}; + +class TestDeviceArrayExport : public ::testing::Test { + public: + void SetUp() override { pool_ = default_memory_pool(); } + + static Result> ToDeviceData( + const std::shared_ptr& mm, const ArrayData& data) { + 
arrow::BufferVector buffers; + for (const auto& buf : data.buffers) { + if (buf) { + ARROW_ASSIGN_OR_RAISE(auto dest, mm->CopyBuffer(buf, mm)); + buffers.push_back(dest); + } else { + buffers.push_back(nullptr); + } + } + + arrow::ArrayDataVector children; + for (const auto& child : data.child_data) { + ARROW_ASSIGN_OR_RAISE(auto dest, ToDeviceData(mm, *child)); + children.push_back(dest); + } + + return ArrayData::Make(data.type, data.length, buffers, children, data.null_count, + data.offset); + } + + static Result> ToDevice(const std::shared_ptr& mm, + const ArrayData& data) { + ARROW_ASSIGN_OR_RAISE(auto result, ToDeviceData(mm, data)); + return MakeArray(result); + } + + template + static std::function>()> ToDeviceFactory( + const std::shared_ptr& mm, ArrayFactory&& factory) { + return [&]() { return ToDevice(mm, *factory()->data()); }; + } + + static std::function>()> JSONArrayFactory( + const std::shared_ptr& mm, std::shared_ptr type, + const char* json) { + return [=]() { return ToDevice(mm, *ArrayFromJSON(type, json)->data()); }; + } + + template + void TestWithArrayFactory(ArrayFactory&& factory, ExportCheckFunc&& check_func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK_AND_ASSIGN(arr, ToResult(factory())); + ARROW_SCOPED_TRACE("type = ", arr->type()->ToString(), + ", array data = ", arr->ToString()); + const ArrayData& data = *arr->data(); // non-owning reference + struct ArrowDeviceArray c_export; + ASSERT_OK(ExportDeviceArray(*arr, {nullptr, nullptr}, &c_export)); + + ArrayExportGuard guard(&c_export.array); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + + // Release the shared_ptr, underlying data should be held alive + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + check_func(&c_export, data, kMyDeviceType, 1, nullptr); + + // Release the ArrowArray, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestNested(ArrayFactory&& factory) { + ArrayExportChecker checker; + TestWithArrayFactory(std::forward(factory), checker); + } + + void TestNested(const std::shared_ptr& mm, + const std::shared_ptr& type, const char* json) { + TestNested(JSONArrayFactory(mm, type, json)); + } + + template + void TestPrimitive(ArrayFactory&& factory) { + TestNested(std::forward(factory)); + } + + void TestPrimitive(const std::shared_ptr& mm, + const std::shared_ptr& type, const char* json) { + TestNested(mm, type, json); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestDeviceArrayExport, Primitive) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestPrimitive(mm, int8(), "[1, 2, null, -3]"); + TestPrimitive(mm, int16(), "[1, 2, -3]"); + TestPrimitive(mm, int32(), "[1, 2, null, -3]"); + TestPrimitive(mm, int64(), "[1, 2, -3]"); + TestPrimitive(mm, uint8(), "[1, 2, 3]"); + TestPrimitive(mm, uint16(), "[1, 2, null, 3]"); + TestPrimitive(mm, uint32(), "[1, 2, 3]"); + TestPrimitive(mm, uint64(), "[1, 2, null, 3]"); + + TestPrimitive(mm, boolean(), "[true, false, null]"); + + TestPrimitive(mm, float32(), "[1.5, null]"); + TestPrimitive(mm, float64(), "[1.5, null]"); + + TestPrimitive(mm, fixed_size_binary(3), R"(["foo", "bar", null])"); + TestPrimitive(mm, binary(), R"(["foo", "bar", null])"); + TestPrimitive(mm, large_binary(), R"(["foo", "bar", null])"); + TestPrimitive(mm, utf8(), R"(["foo", "bar", null])"); + TestPrimitive(mm, large_utf8(), R"(["foo", "bar", null])"); + 
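+  // Each check in this test round-trips a JSON-built array onto the fake
+  // MyDevice via MyMemoryManager, exports it with a null sync event, and
+  // verifies that the ArrowDeviceArray reports kMyDeviceType and device_id 1
+  // while buffer lifetimes (tracked through the memory pool) stay balanced.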
+ TestPrimitive(mm, decimal(16, 4), R"(["1234.5670", null])"); + TestPrimitive(mm, decimal256(16, 4), R"(["1234.5670", null])"); + + TestPrimitive(mm, month_day_nano_interval(), R"([[-1, 5, 20], null])"); +} + +TEST_F(TestDeviceArrayExport, PrimitiveSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + auto factory = [=]() { + return (*ToDevice(mm, *ArrayFromJSON(int16(), "[1, 2, null, -3]")->data())) + ->Slice(1, 2); + }; + TestPrimitive(factory); +} + +TEST_F(TestDeviceArrayExport, Temporal) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* json = "[1, 2, null, 42]"; + TestPrimitive(mm, date32(), json); + TestPrimitive(mm, date64(), json); + TestPrimitive(mm, time32(TimeUnit::SECOND), json); + TestPrimitive(mm, time32(TimeUnit::MILLI), json); + TestPrimitive(mm, time64(TimeUnit::MICRO), json); + TestPrimitive(mm, time64(TimeUnit::NANO), json); + TestPrimitive(mm, duration(TimeUnit::SECOND), json); + TestPrimitive(mm, duration(TimeUnit::MILLI), json); + TestPrimitive(mm, duration(TimeUnit::MICRO), json); + TestPrimitive(mm, duration(TimeUnit::NANO), json); + TestPrimitive(mm, month_interval(), json); + + TestPrimitive(mm, day_time_interval(), "[[7, 600], null]"); + + json = R"(["1970-01-01","2000-02-29","1900-02-28"])"; + TestPrimitive(mm, timestamp(TimeUnit::SECOND), json); + TestPrimitive(mm, timestamp(TimeUnit::SECOND, "Europe/Paris"), json); + TestPrimitive(mm, timestamp(TimeUnit::MILLI), json); + TestPrimitive(mm, timestamp(TimeUnit::MILLI, "Europe/Paris"), json); + TestPrimitive(mm, timestamp(TimeUnit::MICRO), json); + TestPrimitive(mm, timestamp(TimeUnit::MICRO, "Europe/Paris"), json); + TestPrimitive(mm, timestamp(TimeUnit::NANO), json); + TestPrimitive(mm, timestamp(TimeUnit::NANO, "Europe/Paris"), json); +} + +TEST_F(TestDeviceArrayExport, List) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestNested(mm, list(int8()), "[[1, 2], [3, null], null]"); + TestNested(mm, large_list(uint16()), "[[1, 2], [3, null], null]"); + TestNested(mm, fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(mm, list(large_list(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestDeviceArrayExport, ListSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + { + auto factory = [=]() { + return (*ToDevice( + mm, *ArrayFromJSON(list(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->data())) + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = [=]() { + auto values = + (*ToDevice(mm, + *ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->data())) + ->Slice(1, 6); + auto offsets = (*ToDevice(mm, *ArrayFromJSON(int32(), "[0, 2, 3, 5, 6]")->data())) + ->Slice(2, 4); + return ListArray::FromArrays(*offsets, *values); + }; + TestNested(factory); + } +} + +TEST_F(TestDeviceArrayExport, Struct) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* data = R"([[1, "foo"], [2, null]])"; + auto type = struct_({field("a", int8()), field("b", utf8())}); + TestNested(mm, type, data); +} + +TEST_F(TestDeviceArrayExport, Map) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* json = R"([[[1, "foo"], [2, null]], [[3, "bar"]]])"; + TestNested(mm, map(int8(), utf8()), json); + TestNested(mm, map(int8(), utf8(), /*keys_sorted=*/true), 
json); +} + +TEST_F(TestDeviceArrayExport, Union) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* data = "[null, [42, 1], [43, true], [42, null], [42, 2]]"; + // Dense + auto field_a = field("a", int8()); + auto field_b = field("b", boolean(), /*nullable=*/false); + auto type = dense_union({field_a, field_b}, {42, 43}); + TestNested(mm, type, data); + // Sparse + field_a = field("a", int8(), /*nullable=*/false); + field_b = field("b", boolean()); + type = sparse_union({field_a, field_b}, {42, 43}); + TestNested(mm, type, data); +} + +TEST_F(TestDeviceArrayExport, Extension) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestPrimitive(ToDeviceFactory(mm, ExampleUuid)); + TestPrimitive(ToDeviceFactory(mm, ExampleSmallint)); + TestPrimitive(ToDeviceFactory(mm, ExampleComplex128)); +} + +TEST_F(TestDeviceArrayExport, ExportArrayAndType) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + struct ArrowSchema c_schema {}; + struct ArrowDeviceArray c_array {}; + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array.array); + + auto array = ToDevice(mm, *ArrayFromJSON(int8(), "[1, 2, 3]")->data()).ValueOrDie(); + ASSERT_OK(ExportDeviceArray(*array, {nullptr, nullptr}, &c_array, &c_schema)); + const ArrayData& data = *array->data(); + array.reset(); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_FALSE(ArrowArrayIsReleased(&c_array.array)); + ASSERT_EQ(c_schema.format, std::string("c")); + ASSERT_EQ(c_schema.n_children, 0); + ArrayExportChecker checker{}; + checker(&c_array, data, kMyDeviceType, 1, nullptr); +} + +TEST_F(TestDeviceArrayExport, ExportRecordBatch) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + struct ArrowSchema c_schema {}; + struct ArrowDeviceArray c_array {}; + + auto schema = ::arrow::schema( + {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); + schema = schema->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + auto arr0 = ToDevice(mm, *ArrayFromJSON(int16(), "[1, 2, null]")->data()).ValueOrDie(); + auto arr1 = ToDevice(mm, *ArrayFromJSON(boolean(), "[false, true, false]")->data()) + .ValueOrDie(); + + auto batch_factory = [&]() { return RecordBatch::Make(schema, 3, {arr0, arr1}); }; + + { + auto batch = batch_factory(); + + ASSERT_OK(ExportDeviceRecordBatch(*batch, {nullptr, nullptr}, &c_array, &c_schema)); + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array.array); + RecordBatchExportChecker checker{}; + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + + // create batch anew, with the same buffer pointers + batch = batch_factory(); + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + } + { + // Check one can export both schema and record batch at once + auto batch = batch_factory(); + + ASSERT_OK(ExportDeviceRecordBatch(*batch, {nullptr, nullptr}, &c_array, &c_schema)); + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array.array); + ASSERT_EQ(c_schema.format, std::string("+s")); + ASSERT_EQ(c_schema.n_children, 2); + ASSERT_NE(c_schema.metadata, nullptr); + ASSERT_EQ(kEncodedMetadata2, + std::string(c_schema.metadata, kEncodedMetadata2.size())); + RecordBatchExportChecker checker{}; + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + + // Create batch anew, with the same buffer pointers + batch = 
batch_factory(); + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + } +} + //////////////////////////////////////////////////////////////////////////// // Schema import tests diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index ae7e82fb2f9e4..d7a61d0a55985 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -775,34 +775,6 @@ SCALAR_EAGER_BINARY(Or, "or") SCALAR_EAGER_BINARY(Xor, "xor") SCALAR_EAGER_UNARY(Invert, "invert") -// ---------------------------------------------------------------------- - -Result Compare(const Datum& left, const Datum& right, CompareOptions options, - ExecContext* ctx) { - std::string func_name; - switch (options.op) { - case CompareOperator::EQUAL: - func_name = "equal"; - break; - case CompareOperator::NOT_EQUAL: - func_name = "not_equal"; - break; - case CompareOperator::GREATER: - func_name = "greater"; - break; - case CompareOperator::GREATER_EQUAL: - func_name = "greater_equal"; - break; - case CompareOperator::LESS: - func_name = "less"; - break; - case CompareOperator::LESS_EQUAL: - func_name = "less_equal"; - break; - } - return CallFunction(func_name, {left, right}, nullptr, ctx); -} - // ---------------------------------------------------------------------- // Validity functions diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 10a2b4bffde6d..0a06a2829f0da 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -970,24 +970,6 @@ Result RoundTemporal( const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(), ExecContext* ctx = NULLPTR); -/// \brief Compare a numeric array with a scalar. -/// -/// \param[in] left datum to compare, must be an Array -/// \param[in] right datum to compare, must be a Scalar of the same type than -/// left Datum. -/// \param[in] options compare options -/// \param[in] ctx the function execution context, optional -/// \return resulting datum -/// -/// Note on floating point arrays, this uses ieee-754 compare semantics. -/// -/// \since 1.0.0 -/// \note API not yet finalized -ARROW_DEPRECATED("Deprecated in 5.0.0. 
Use each compare function directly") -ARROW_EXPORT -Result Compare(const Datum& left, const Datum& right, CompareOptions options, - ExecContext* ctx = NULLPTR); - /// \brief Invert the values of a boolean datum /// \param[in] value datum to invert /// \param[in] ctx the function execution context, optional diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 67595c3308f9b..f73b10e11edd7 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -417,12 +417,5 @@ Result CumulativeMin(const Datum& values, const CumulativeOptions& option return CallFunction("cumulative_min", {Datum(values)}, &options, ctx); } -// ---------------------------------------------------------------------- -// Deprecated functions - -Result> SortToIndices(const Array& values, ExecContext* ctx) { - return SortIndices(values, SortOrder::Ascending, ctx); -} - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index c85db1aa3ba88..4f226ac00788a 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -683,13 +683,5 @@ Result> PairwiseDiff(const Array& array, bool check_overflow = false, ExecContext* ctx = NULLPTR); -// ---------------------------------------------------------------------- -// Deprecated functions - -ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()") -ARROW_EXPORT -Result> SortToIndices(const Array& values, - ExecContext* ctx = NULLPTR); - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index d57604583e8be..f90e01a2f81bc 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -31,7 +31,9 @@ #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" +using testing::Eq; using testing::HasSubstr; using testing::UnorderedElementsAreArray; @@ -78,27 +80,11 @@ Expression add(Expression l, Expression r) { return call("add", {std::move(l), std::move(r)}); } -template -void ExpectResultsEqual(Actual&& actual, Expected&& expected) { - using MaybeActual = typename EnsureResult::type>::type; - using MaybeExpected = typename EnsureResult::type>::type; - - MaybeActual maybe_actual(std::forward(actual)); - MaybeExpected maybe_expected(std::forward(expected)); - - if (maybe_expected.ok()) { - EXPECT_EQ(maybe_actual, maybe_expected); - } else { - EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT( - expected.status().code(), HasSubstr(expected.status().message()), maybe_actual); - } -} - const auto no_change = std::nullopt; TEST(ExpressionUtils, Comparison) { - auto Expect = [](Result expected, Datum l, Datum r) { - ExpectResultsEqual(Comparison::Execute(l, r).Map(Comparison::GetName), expected); + auto cmp_name = [](Datum l, Datum r) { + return Comparison::Execute(l, r).Map(Comparison::GetName); }; Datum zero(0), one(1), two(2), null(std::make_shared()); @@ -106,27 +92,28 @@ TEST(ExpressionUtils, Comparison) { Datum dict_str(DictionaryScalar::Make(std::make_shared(0), ArrayFromJSON(utf8(), R"(["a", "b", "c"])"))); - Status not_impl = Status::NotImplemented("no kernel matching input types"); + auto RaisesNotImpl = + Raises(StatusCode::NotImplemented, HasSubstr("no kernel matching input types")); - Expect("equal", one, one); - Expect("less", one, two); - Expect("greater", one, zero); + 
EXPECT_THAT(cmp_name(one, one), ResultWith(Eq("equal"))); + EXPECT_THAT(cmp_name(one, two), ResultWith(Eq("less"))); + EXPECT_THAT(cmp_name(one, zero), ResultWith(Eq("greater"))); - Expect("na", one, null); - Expect("na", null, one); + EXPECT_THAT(cmp_name(one, null), ResultWith(Eq("na"))); + EXPECT_THAT(cmp_name(null, one), ResultWith(Eq("na"))); // strings and ints are not comparable without explicit casts - Expect(not_impl, str, one); - Expect(not_impl, one, str); - Expect(not_impl, str, null); // not even null ints + EXPECT_THAT(cmp_name(str, one), RaisesNotImpl); + EXPECT_THAT(cmp_name(one, str), RaisesNotImpl); + EXPECT_THAT(cmp_name(str, null), RaisesNotImpl); // not even null ints // string -> binary implicit cast allowed - Expect("equal", str, bin); - Expect("equal", bin, str); + EXPECT_THAT(cmp_name(str, bin), ResultWith(Eq("equal"))); + EXPECT_THAT(cmp_name(bin, str), ResultWith(Eq("equal"))); // dict_str -> string, implicit casts allowed - Expect("less", dict_str, str); - Expect("less", dict_str, bin); + EXPECT_THAT(cmp_name(dict_str, str), ResultWith(Eq("less"))); + EXPECT_THAT(cmp_name(dict_str, bin), ResultWith(Eq("less"))); } TEST(ExpressionUtils, StripOrderPreservingCasts) { diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index a17d6275a763a..0bd6fe86134ab 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -18,11 +18,20 @@ # ---------------------------------------------------------------------- # Tests that don't require the full kernel library +# Define arrow_compute_testing object library for common test files +if(ARROW_TESTING) + add_library(arrow_compute_kernels_testing OBJECT test_util.cc) + # Even though this is still just an object library we still need to "link" our + # dependencies so that include paths are configured correctly + target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST}) +endif() + add_arrow_test(scalar_cast_test ${ARROW_COMPUTE_TEST_ARGS} SOURCES scalar_cast_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) # ---------------------------------------------------------------------- # Scalar kernels @@ -32,25 +41,36 @@ add_arrow_compute_test(scalar_type_test scalar_boolean_test.cc scalar_nested_test.cc scalar_string_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(scalar_if_else_test SOURCES scalar_if_else_test.cc test_util.cc) +add_arrow_compute_test(scalar_if_else_test + SOURCES + scalar_if_else_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(scalar_temporal_test SOURCES scalar_temporal_test.cc test_util.cc) +add_arrow_compute_test(scalar_temporal_test + SOURCES + scalar_temporal_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_compute_test(scalar_math_test SOURCES scalar_arithmetic_test.cc scalar_compare_test.cc scalar_round_arithmetic_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_compute_test(scalar_utility_test SOURCES scalar_random_test.cc scalar_set_lookup_test.cc scalar_validity_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") @@ -75,12 +95,20 @@ add_arrow_compute_test(vector_test vector_replace_test.cc vector_run_end_encode_test.cc select_k_test.cc - test_util.cc) + EXTRA_LINK_LIBS 
+ arrow_compute_kernels_testing) -add_arrow_compute_test(vector_sort_test SOURCES vector_sort_test.cc test_util.cc) +add_arrow_compute_test(vector_sort_test + SOURCES + vector_sort_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(vector_selection_test SOURCES vector_selection_test.cc - test_util.cc) +add_arrow_compute_test(vector_selection_test + SOURCES + vector_selection_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") @@ -94,7 +122,11 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # Aggregates -add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc) +add_arrow_compute_test(aggregate_test + SOURCES + aggregate_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) # ---------------------------------------------------------------------- # Utilities diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 677b19138995e..47cae538e2e3f 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -246,10 +246,10 @@ struct GroupedCountAllImpl : public GroupedAggregator { const ArrayData& group_id_mapping) override { auto other = checked_cast(&raw_other); - auto counts = reinterpret_cast(counts_.mutable_data()); - auto other_counts = reinterpret_cast(other->counts_.data()); + auto* counts = counts_.mutable_data_as(); + const auto* other_counts = other->counts_.data_as(); - auto g = group_id_mapping.GetValues(1); + auto* g = group_id_mapping.GetValues(1); for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) { counts[*g] += other_counts[other_g]; } @@ -257,8 +257,8 @@ struct GroupedCountAllImpl : public GroupedAggregator { } Status Consume(const ExecSpan& batch) override { - auto counts = reinterpret_cast(counts_.mutable_data()); - auto g_begin = batch[0].array.GetValues(1); + auto* counts = counts_.mutable_data_as(); + auto* g_begin = batch[0].array.GetValues(1); for (auto g_itr = g_begin, end = g_itr + batch.length; g_itr != end; g_itr++) { counts[*g_itr] += 1; } @@ -293,10 +293,10 @@ struct GroupedCountImpl : public GroupedAggregator { const ArrayData& group_id_mapping) override { auto other = checked_cast(&raw_other); - auto counts = reinterpret_cast(counts_.mutable_data()); - auto other_counts = reinterpret_cast(other->counts_.mutable_data()); + auto* counts = counts_.mutable_data_as(); + const auto* other_counts = other->counts_.data_as(); - auto g = group_id_mapping.GetValues(1); + auto* g = group_id_mapping.GetValues(1); for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) { counts[*g] += other_counts[other_g]; } @@ -344,8 +344,8 @@ struct GroupedCountImpl : public GroupedAggregator { }; Status Consume(const ExecSpan& batch) override { - auto counts = reinterpret_cast(counts_.mutable_data()); - auto g_begin = batch[1].array.GetValues(1); + auto* counts = counts_.mutable_data_as(); + auto* g_begin = batch[1].array.GetValues(1); if (options_.mode == CountOptions::ALL) { for (int64_t i = 0; i < batch.length; ++i, ++g_begin) { @@ -682,7 +682,7 @@ struct GroupedSumNullImpl final : public GroupedNullImpl { std::shared_ptr out_type() const override { return int64(); } void output_empty(const std::shared_ptr& data) override { - std::fill_n(reinterpret_cast(data->mutable_data()), num_groups_, 0); + 
std::fill_n(data->mutable_data_as(), num_groups_, 0); } }; @@ -722,7 +722,7 @@ struct GroupedProductNullImpl final : public GroupedNullImpl { std::shared_ptr out_type() const override { return int64(); } void output_empty(const std::shared_ptr& data) override { - std::fill_n(reinterpret_cast(data->mutable_data()), num_groups_, 1); + std::fill_n(data->mutable_data_as(), num_groups_, 1); } }; @@ -785,7 +785,7 @@ struct GroupedMeanImpl : public GroupedReducingAggregatordata(); ARROW_ASSIGN_OR_RAISE(std::shared_ptr values, AllocateBuffer(num_groups * sizeof(MeanType), pool)); - MeanType* means = reinterpret_cast(values->mutable_data()); + auto* means = values->mutable_data_as(); for (int64_t i = 0; i < num_groups; ++i) { if (counts[i] >= options.min_count) { ARROW_ASSIGN_OR_RAISE(means[i], DoMean(reduced[i], counts[i])); @@ -814,7 +814,7 @@ struct GroupedMeanNullImpl final : public GroupedNullImpl { std::shared_ptr out_type() const override { return float64(); } void output_empty(const std::shared_ptr& data) override { - std::fill_n(reinterpret_cast(data->mutable_data()), num_groups_, 0); + std::fill_n(data->mutable_data_as(), num_groups_, 0); } }; @@ -915,7 +915,7 @@ struct GroupedVarStdImpl : public GroupedAggregator { ARROW_ASSIGN_OR_RAISE(auto mapping, AllocateBuffer(num_groups_ * sizeof(uint32_t), pool_)); for (uint32_t i = 0; static_cast(i) < num_groups_; i++) { - reinterpret_cast(mapping->mutable_data())[i] = i; + mapping->template mutable_data_as()[i] = i; } ArrayData group_id_mapping(uint32(), num_groups_, {nullptr, std::move(mapping)}, /*null_count=*/0); @@ -932,7 +932,7 @@ struct GroupedVarStdImpl : public GroupedAggregator { // for int32: -2^62 <= sum < 2^62 constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8); - const auto g = batch[1].array.GetValues(1); + const auto* g = batch[1].array.GetValues(1); if (batch[0].is_scalar() && !batch[0].scalar->is_valid) { uint8_t* no_nulls = no_nulls_.mutable_data(); for (int64_t i = 0; i < batch.length; i++) { @@ -946,7 +946,7 @@ struct GroupedVarStdImpl : public GroupedAggregator { ARROW_ASSIGN_OR_RAISE(auto mapping, AllocateBuffer(num_groups_ * sizeof(uint32_t), pool_)); for (uint32_t i = 0; static_cast(i) < num_groups_; i++) { - reinterpret_cast(mapping->mutable_data())[i] = i; + mapping->template mutable_data_as()[i] = i; } ArrayData group_id_mapping(uint32(), num_groups_, {nullptr, std::move(mapping)}, /*null_count=*/0); @@ -1049,7 +1049,7 @@ struct GroupedVarStdImpl : public GroupedAggregator { AllocateBuffer(num_groups_ * sizeof(double), pool_)); int64_t null_count = 0; - double* results = reinterpret_cast(values->mutable_data()); + auto* results = values->mutable_data_as(); const int64_t* counts = counts_.data(); const double* m2s = m2s_.data(); for (int64_t i = 0; i < num_groups_; ++i) { @@ -1223,7 +1223,7 @@ struct GroupedTDigestImpl : public GroupedAggregator { AllocateBuffer(num_values * sizeof(double), pool_)); int64_t null_count = 0; - double* results = reinterpret_cast(values->mutable_data()); + auto* results = values->mutable_data_as(); for (int64_t i = 0; static_cast(i) < tdigests_.size(); ++i) { if (!tdigests_[i].is_empty() && counts[i] >= options_.min_count && (options_.skip_nulls || bit_util::GetBit(no_nulls_.data(), i))) { @@ -1567,7 +1567,7 @@ struct GroupedMinMaxImplmemory_pool())); - offset_type* offsets = reinterpret_cast(raw_offsets->mutable_data()); + auto* offsets = raw_offsets->mutable_data_as(); offsets[0] = 0; offsets++; const uint8_t* null_bitmap = array->buffers[0]->data(); @@ -2100,7 +2100,7 @@ 
struct GroupedFirstLastImplmemory_pool())); - offset_type* offsets = reinterpret_cast(raw_offsets->mutable_data()); + auto* offsets = raw_offsets->mutable_data_as(); offsets[0] = 0; offsets++; const uint8_t* null_bitmap = array->buffers[0]->data(); @@ -2464,9 +2464,9 @@ struct GroupedCountDistinctImpl : public GroupedAggregator { ARROW_ASSIGN_OR_RAISE(std::shared_ptr remapped_g, AllocateBuffer(uniques.length * sizeof(uint32_t), pool_)); - const auto* g_mapping = group_id_mapping.GetValues(1); - const auto* other_g = uniques[1].array()->GetValues(1); - auto* g = reinterpret_cast(remapped_g->mutable_data()); + const auto* g_mapping = group_id_mapping.buffers[1]->data_as(); + const auto* other_g = uniques[1].array()->buffers[1]->data_as(); + auto* g = remapped_g->mutable_data_as(); for (int64_t i = 0; i < uniques.length; i++) { g[i] = g_mapping[other_g[i]]; @@ -2480,7 +2480,7 @@ struct GroupedCountDistinctImpl : public GroupedAggregator { Result Finalize() override { ARROW_ASSIGN_OR_RAISE(std::shared_ptr values, AllocateBuffer(num_groups_ * sizeof(int64_t), pool_)); - int64_t* counts = reinterpret_cast(values->mutable_data()); + auto* counts = values->mutable_data_as(); std::fill(counts, counts + num_groups_, 0); ARROW_ASSIGN_OR_RAISE(auto uniques, grouper_->GetUniques()); @@ -2524,9 +2524,9 @@ struct GroupedDistinctImpl : public GroupedCountDistinctImpl { static_cast(num_groups_), ctx_)); ARROW_ASSIGN_OR_RAISE( auto list, grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_)); - auto values = list->values(); + const auto& values = list->values(); DCHECK_EQ(values->offset(), 0); - int32_t* offsets = reinterpret_cast(list->value_offsets()->mutable_data()); + auto* offsets = list->value_offsets()->mutable_data_as(); if (options_.mode == CountOptions::ALL || (options_.mode == CountOptions::ONLY_VALID && values->null_count() == 0)) { return list; @@ -2754,7 +2754,7 @@ struct GroupedOneImpl::value || ARROW_ASSIGN_OR_RAISE( auto raw_offsets, AllocateBuffer((1 + values.size()) * sizeof(offset_type), ctx_->memory_pool())); - auto* offsets = reinterpret_cast(raw_offsets->mutable_data()); + auto* offsets = raw_offsets->mutable_data_as(); offsets[0] = 0; offsets++; const uint8_t* null_bitmap = array->buffers[0]->data(); @@ -2952,7 +2952,7 @@ struct GroupedListImpl final : public GroupedAggregator { RETURN_NOT_OK(groups_.Append(g[other_raw_groups[other_g]])); } - const uint8_t* values = reinterpret_cast(other->values_.data()); + const auto* values = reinterpret_cast(other->values_.data()); RETURN_NOT_OK(GetSet::AppendBuffers(&values_, values, 0, other->num_args_)); if (other->has_nulls_) { @@ -3093,7 +3093,7 @@ struct GroupedListImpl::value || ARROW_ASSIGN_OR_RAISE( auto raw_offsets, AllocateBuffer((1 + values.size()) * sizeof(offset_type), ctx_->memory_pool())); - auto* offsets = reinterpret_cast(raw_offsets->mutable_data()); + auto* offsets = raw_offsets->mutable_data_as(); offsets[0] = 0; offsets++; const uint8_t* null_bitmap = array->buffers[0]->data(); diff --git a/cpp/src/arrow/compute/kernels/ree_util_internal.cc b/cpp/src/arrow/compute/kernels/ree_util_internal.cc index 00c885f6fa9db..d35c000678ba4 100644 --- a/cpp/src/arrow/compute/kernels/ree_util_internal.cc +++ b/cpp/src/arrow/compute/kernels/ree_util_internal.cc @@ -59,7 +59,7 @@ Result> PreallocateRunEndsArray( Result> PreallocateValuesArray( const std::shared_ptr& value_type, bool has_validity_buffer, int64_t length, - int64_t null_count, MemoryPool* pool, int64_t data_buffer_size) { + MemoryPool* pool, int64_t 
data_buffer_size) { std::vector> values_data_buffers; std::shared_ptr validity_buffer = NULLPTR; if (has_validity_buffer) { @@ -79,20 +79,22 @@ Result> PreallocateValuesArray( } else { values_data_buffers = {std::move(validity_buffer), std::move(values_buffer)}; } - return ArrayData::Make(value_type, length, std::move(values_data_buffers), null_count); + auto data = ArrayData::Make(value_type, length, std::move(values_data_buffers), + kUnknownNullCount); + DCHECK(!(has_validity_buffer && length > 0) || data->buffers[0]); + return data; } Result> PreallocateREEArray( std::shared_ptr ree_type, bool has_validity_buffer, - int64_t logical_length, int64_t physical_length, int64_t physical_null_count, - MemoryPool* pool, int64_t data_buffer_size) { + int64_t logical_length, int64_t physical_length, MemoryPool* pool, + int64_t data_buffer_size) { ARROW_ASSIGN_OR_RAISE( auto run_ends_data, PreallocateRunEndsArray(ree_type->run_end_type(), physical_length, pool)); - ARROW_ASSIGN_OR_RAISE( - auto values_data, - PreallocateValuesArray(ree_type->value_type(), has_validity_buffer, physical_length, - physical_null_count, pool, data_buffer_size)); + ARROW_ASSIGN_OR_RAISE(auto values_data, PreallocateValuesArray( + ree_type->value_type(), has_validity_buffer, + physical_length, pool, data_buffer_size)); return ArrayData::Make(std::move(ree_type), logical_length, {NULLPTR}, {std::move(run_ends_data), std::move(values_data)}, diff --git a/cpp/src/arrow/compute/kernels/ree_util_internal.h b/cpp/src/arrow/compute/kernels/ree_util_internal.h index 080d23c06a1f9..3293e754d3b65 100644 --- a/cpp/src/arrow/compute/kernels/ree_util_internal.h +++ b/cpp/src/arrow/compute/kernels/ree_util_internal.h @@ -333,18 +333,39 @@ Result> PreallocateRunEndsArray( const std::shared_ptr& run_end_type, int64_t physical_length, MemoryPool* pool); +/// \brief Preallocate the physical values array for a run-end encoded array +/// +/// data_buffer_size is passed here pre-calculated so this function doesn't have +/// to be template-specialized for each type. +/// +/// The null_count is left as kUnknownNullCount (or 0 if length is 0) and, if +/// after writing the values, the caller knows the null count, it can be set. +/// +/// \post if has_validity_buffer and length > 0, then data.buffer[0] != NULLPTR +/// +/// \param has_validity_buffer a validity buffer must be allocated +/// \param length the length of the values array +/// \param data_buffer_size the size of the data buffer for string and binary types Result> PreallocateValuesArray( const std::shared_ptr& value_type, bool has_validity_buffer, int64_t length, - int64_t null_count, MemoryPool* pool, int64_t data_buffer_size); + MemoryPool* pool, int64_t data_buffer_size); /// \brief Preallocate the ArrayData for the run-end encoded version /// of the flat input array /// +/// The top-level null_count is set to 0 (REEs keep all the data in child +/// arrays). The null_count of the values array (child_data[1]) is left as +/// kUnknownNullCount (or 0 if physical_length is 0) and, if after writing +/// the values, the caller knows the null count, it can be set. 
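+///
+/// For example (names illustrative), a caller that fills the values and ends
+/// up with `n_valid` valid entries out of `physical_length` can afterwards set
+/// `data->child_data[1]->null_count = physical_length - n_valid`.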
+/// +/// \post if has_validity_buffer and physical_length > 0, then +/// data.child_data[1].buffer[0] != NULLPTR +/// /// \param data_buffer_size the size of the data buffer for string and binary types Result> PreallocateREEArray( std::shared_ptr ree_type, bool has_validity_buffer, - int64_t logical_length, int64_t physical_length, int64_t physical_null_count, - MemoryPool* pool, int64_t data_buffer_size); + int64_t logical_length, int64_t physical_length, MemoryPool* pool, + int64_t data_buffer_size); /// \brief Writes a single run-end to the first slot of the pre-allocated /// run-end encoded array in out diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 2c7363b3ca486..c305028be19c9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" #include "arrow/util/int_util_overflow.h" @@ -1509,6 +1510,13 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { DCHECK_OK( divide->AddKernel({duration(unit), int64()}, duration(unit), std::move(exec))); } + + // Add divide(duration, duration) -> float64 + for (auto unit : TimeUnit::values()) { + auto exec = ScalarBinaryNotNull::Exec; + DCHECK_OK( + divide->AddKernel({duration(unit), duration(unit)}, float64(), std::move(exec))); + } DCHECK_OK(registry->AddFunction(std::move(divide))); // ---------------------------------------------------------------------- @@ -1523,6 +1531,14 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { std::move(exec))); } + // Add divide_checked(duration, duration) -> float64 + for (auto unit : TimeUnit::values()) { + auto exec = + ScalarBinaryNotNull::Exec; + DCHECK_OK(divide_checked->AddKernel({duration(unit), duration(unit)}, float64(), + std::move(exec))); + } + DCHECK_OK(registry->AddFunction(std::move(divide_checked))); // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index a37996458c0b5..756b3028c4a59 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -911,11 +911,11 @@ TYPED_TEST(TestBinaryArithmeticFloating, Power) { this->AssertBinop(Power, "[null, 1, 3.3, null, 2]", "[1, 4, 2, 5, 0.1]", "[null, 1, 10.89, null, 1.07177346]"); // Scalar exponentiated by array - this->AssertBinop(Power, 10.0F, "[null, 1, 2.5, null, 2, 5]", - "[null, 10, 316.227766017, null, 100, 100000]"); + this->AssertBinop(Power, 4.0F, "[null, 1, 0.5, null, 2, 5]", + "[null, 4, 2.0, null, 16, 1024]"); // Array exponentiated by scalar - this->AssertBinop(Power, "[null, 1, 2.5, null, 2, 5]", 10.0F, - "[null, 1, 9536.74316406, null, 1024, 9765625]"); + this->AssertBinop(Power, "[null, 1, 0.5, null, 2, 5]", 4.0F, + "[null, 1, 0.0625, null, 16, 625]"); // Array with infinity this->AssertBinop(Power, "[3.4, Inf, -Inf, 1.1, 100000]", "[1, 2, 3, Inf, 100000]", "[3.4, Inf, -Inf, Inf, Inf]"); @@ -925,7 +925,7 @@ TYPED_TEST(TestBinaryArithmeticFloating, Power) { this->AssertBinop(Power, 21.0F, 3.0F, 9261.0F); // Divide by zero this->AssertBinop(Power, "[0.0, 0.0]", "[-1.0, -3.0]", "[Inf, Inf]"); - // Check overflow behaviour + // Check 
overflow behavior this->AssertBinop(Power, max, 10, INFINITY); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 083a85eb346c5..1db06a762544b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1025,7 +1025,8 @@ TEST(Cast, DecimalToFloating) { } } - // Edge cases are tested for Decimal128::ToReal() and Decimal256::ToReal() + // Edge cases are tested for Decimal128::ToReal() and Decimal256::ToReal() in + // decimal_test.cc } TEST(Cast, DecimalToString) { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 879d6285f3441..ded73f0371435 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -126,75 +126,115 @@ TYPED_TEST(TestIfElsePrimitive, IfElseFixedSizeRand) { CheckIfElseOutput(cond, left, right, expected_data); } -void CheckWithDifferentShapes(const std::shared_ptr& cond, - const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - // this will check for whole arrays, every scalar at i'th index and slicing (offset) - CheckScalar("if_else", {cond, left, right}, expected); - - auto len = left->length(); - std::vector array_indices = {-1}; // sentinel for make_input - std::vector scalar_indices(len); - std::iota(scalar_indices.begin(), scalar_indices.end(), 0); - auto make_input = [&](const std::shared_ptr& array, int64_t index, Datum* input, - Datum* input_broadcast, std::string* trace) { - if (index >= 0) { - // Use scalar from array[index] as input; broadcast scalar for computing expected - // result - ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(index)); - *trace += "@" + std::to_string(index) + "=" + scalar->ToString(); - *input = std::move(scalar); - ASSERT_OK_AND_ASSIGN(*input_broadcast, MakeArrayFromScalar(*input->scalar(), len)); +Datum ArrayOrBroadcastScalar(const Datum& input, int64_t length) { + if (input.is_scalar()) { + EXPECT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*input.scalar(), length)); + return array; + } + EXPECT_TRUE(input.is_array()); + return input; +} + +Result ExpectedFromIfElse( + const Datum& cond, const Datum& left, const Datum& right, + std::shared_ptr type, + const std::shared_ptr& expected_if_all_operands_are_arrays) { + if (cond.is_scalar() && left.is_scalar() && right.is_scalar()) { + const auto& scalar = cond.scalar_as(); + Datum expected; + if (scalar.is_valid) { + expected = scalar.value ? left : right; } else { - // Use array as input - *trace += "=Array"; - *input = *input_broadcast = array; + expected = MakeNullScalar(left.type()); } - }; - - enum { COND_SCALAR = 1, LEFT_SCALAR = 2, RIGHT_SCALAR = 4 }; - for (int mask = 1; mask <= (COND_SCALAR | LEFT_SCALAR | RIGHT_SCALAR); ++mask) { - for (int64_t cond_idx : (mask & COND_SCALAR) ? scalar_indices : array_indices) { - Datum cond_in, cond_bcast; - std::string trace_cond = "Cond"; - make_input(cond, cond_idx, &cond_in, &cond_bcast, &trace_cond); - - for (int64_t left_idx : (mask & LEFT_SCALAR) ? scalar_indices : array_indices) { - Datum left_in, left_bcast; - std::string trace_left = "Left"; - make_input(left, left_idx, &left_in, &left_bcast, &trace_left); - - for (int64_t right_idx : (mask & RIGHT_SCALAR) ? 
scalar_indices : array_indices) { - Datum right_in, right_bcast; - std::string trace_right = "Right"; - make_input(right, right_idx, &right_in, &right_bcast, &trace_right); - - SCOPED_TRACE(trace_right); - SCOPED_TRACE(trace_left); - SCOPED_TRACE(trace_cond); - - Datum expected; - ASSERT_OK_AND_ASSIGN(auto actual, IfElse(cond_in, left_in, right_in)); - if (mask == (COND_SCALAR | LEFT_SCALAR | RIGHT_SCALAR)) { - const auto& scalar = cond_in.scalar_as(); - if (scalar.is_valid) { - expected = scalar.value ? left_in : right_in; - } else { - expected = MakeNullScalar(left_in.type()); - } - if (!left_in.type()->Equals(*right_in.type())) { - ASSERT_OK_AND_ASSIGN(expected, - Cast(expected, CastOptions::Safe(actual.type()))); - } - } else { - ASSERT_OK_AND_ASSIGN(expected, IfElse(cond_bcast, left_bcast, right_bcast)); - } - AssertDatumsEqual(expected, actual, /*verbose=*/true); + if (!left.type()->Equals(*right.type())) { + return Cast(expected, CastOptions::Safe(std::move(type))); + } + return expected; + } + if (cond.is_array() && left.is_array() && right.is_array()) { + return expected_if_all_operands_are_arrays; + } + // When at least one of the inputs is an array, we expect the output + // to be the same as if all the scalars were broadcast to arrays. + const auto expected_length = + std::max(cond.length(), std::max(left.length(), right.length())); + SCOPED_TRACE("IfElseAAACall"); + return IfElse(ArrayOrBroadcastScalar(cond, expected_length), + ArrayOrBroadcastScalar(left, expected_length), + ArrayOrBroadcastScalar(right, expected_length)); +} + +bool NextScalarOrWholeArray(const std::shared_ptr& array, int* index, Datum* out) { + if (*index <= array->length()) { + if (*index < array->length()) { + EXPECT_OK_AND_ASSIGN(auto scalar, array->GetScalar(*index)); + *out = std::move(scalar); + } else { + *out = array; + } + *index += 1; + return true; + } + return false; +} + +std::string CodedCallName(const Datum& cond, const Datum& left, const Datum& right) { + std::string coded = "IfElse"; + coded += cond.is_scalar() ? "S" : "A"; + coded += left.is_scalar() ? "S" : "A"; + coded += right.is_scalar() ? 
"S" : "A"; + coded += "Call"; + return coded; +} + +void DoCheckWithDifferentShapes(const std::shared_ptr& cond, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + auto make_trace([&](const char* name, const Datum& datum, int index) { + std::string trace = name; + trace += " : "; + if (datum.is_scalar()) { + trace += "Scalar@" + std::to_string(index) + " = " + datum.scalar()->ToString(); + } else { + EXPECT_TRUE(datum.is_array()); + trace += "Array = [...]"; + } + return trace; + }); + Datum cond_in; + Datum left_in; + Datum right_in; + int cond_index = 0; + int left_index = 0; + int right_index = 0; + while (NextScalarOrWholeArray(cond, &cond_index, &cond_in)) { + SCOPED_TRACE(make_trace("Cond", cond_in, cond_index)); + while (NextScalarOrWholeArray(left, &left_index, &left_in)) { + SCOPED_TRACE(make_trace("Left", left_in, left_index)); + while (NextScalarOrWholeArray(right, &right_index, &right_in)) { + SCOPED_TRACE(make_trace("Right", right_in, right_index)); + Datum actual; + { + SCOPED_TRACE(CodedCallName(cond_in, left_in, right_in)); + ASSERT_OK_AND_ASSIGN(actual, IfElse(cond_in, left_in, right_in)); } + ASSERT_OK_AND_ASSIGN( + auto adjusted_expected, + ExpectedFromIfElse(cond_in, left_in, right_in, actual.type(), expected)); + AssertDatumsEqual(adjusted_expected, actual, /*verbose=*/true); } } - } // for (mask) + } +} + +void CheckWithDifferentShapes(const std::shared_ptr& cond, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + CheckScalar("if_else", {cond, left, right}, expected); + DoCheckWithDifferentShapes(cond, left, right, expected); } TYPED_TEST(TestIfElsePrimitive, IfElseFixedSize) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 4581e6377a7fc..ff14f5e7a5c5d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -2091,6 +2091,14 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsPosPos) { options_step_neg.stop = 0; this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ","𝑓öõḍš"])", this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg); + + constexpr auto max = std::numeric_limits::max(); + SliceOptions options_max_step{1, max, 2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "ö", "ö", "öḍ", "öḍ"])", &options_max_step); + SliceOptions options_max_step_neg{1, max, -2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "", "", "", ""])", &options_max_step_neg); } TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) { @@ -2107,6 +2115,15 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) { this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ","𝑓öõḍš"])", this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "ḍö"])", &options_step_neg); + + constexpr auto min = std::numeric_limits::min(); + SliceOptions options_min_step{2, min, 2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "", "", "", ""])", &options_min_step); + SliceOptions options_min_step_neg{2, min, -2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "𝑓", "ö", "õ𝑓", "õ𝑓", "õ𝑓"])", + &options_min_step_neg); } TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) { 
@@ -2123,6 +2140,15 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) { this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "šõ"])", &options_step_neg); + + constexpr auto min = std::numeric_limits::min(); + SliceOptions options_min_step{-2, min, 2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "", "", "", ""])", &options_min_step); + SliceOptions options_min_step_neg{-2, min, -2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "𝑓", "ö", "õ𝑓", "ḍö"])", + &options_min_step_neg); } TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) { @@ -2138,6 +2164,15 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) { options_step_neg.stop = 0; this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg); + + constexpr auto max = std::numeric_limits::max(); + SliceOptions options_max_step{-3, max, 2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "𝑓", "𝑓", "𝑓õ", "öḍ", "õš"])", + &options_max_step); + SliceOptions options_max_step_neg{-3, max, -2}; + this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])", + this->type(), R"(["", "", "", "", "", ""])", &options_max_step_neg); } #endif // ARROW_WITH_UTF8PROC diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index fb197e13a688b..cf8a697fea411 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -1090,7 +1090,8 @@ struct SliceCodeunitsTransform : StringSliceTransformBase { // on the resulting slice lengths, so return a worst case estimate. return input_ncodeunits; } - int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step; + int64_t stop = std::clamp(opt.stop, -input_ncodeunits, input_ncodeunits); + int64_t max_slice_codepoints = (stop - opt.start + opt.step - 1) / opt.step; // The maximum UTF8 byte size of a codepoint is 4 return std::min(input_ncodeunits, 4 * ninputs * std::max(0, max_slice_codepoints)); @@ -1133,7 +1134,7 @@ struct SliceCodeunitsTransform : StringSliceTransformBase { } else if (opt.stop < 0) { // or from the end (but we will never need to < begin_sliced) RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse( - begin_sliced, end, &end_sliced, -opt.stop)); + begin_sliced, end, &end_sliced, Negate(opt.stop))); } else { // zero length slice return 0; @@ -1158,7 +1159,7 @@ struct SliceCodeunitsTransform : StringSliceTransformBase { // or begin_sliced), but begin_sliced and opt.start can be 'out of sync', // for instance when start=-100, when the string length is only 10. 
RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse( - begin_sliced, end, &end_sliced, -opt.stop)); + begin_sliced, end, &end_sliced, Negate(opt.stop))); } else { // zero length slice return 0; } @@ -1214,11 +1215,12 @@ struct SliceCodeunitsTransform : StringSliceTransformBase { // similar to opt.start if (opt.stop >= 0) { + int64_t length = std::min(opt.stop, std::numeric_limits<int64_t>::max() - 1) + 1; RETURN_IF_UTF8_ERROR( - arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1)); + arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, length)); } else { RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse( - begin, end, &end_sliced, -opt.stop - 1)); + begin, end, &end_sliced, Negate(opt.stop) - 1)); } end_sliced--; @@ -1240,6 +1242,12 @@ struct SliceCodeunitsTransform : StringSliceTransformBase { } #undef RETURN_IF_UTF8_ERROR + + private: + static int64_t Negate(int64_t v) { + constexpr auto max = std::numeric_limits<int64_t>::max(); + return -max > v ? max : -v; + } }; template diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index cd8abf6e923c8..4c7975add0308 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -26,6 +26,7 @@ #include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/formatting.h" @@ -1695,6 +1696,7 @@ TEST_F(ScalarTemporalTest, TestTemporalMultiplyDuration) { } TEST_F(ScalarTemporalTest, TestTemporalDivideDuration) { + // div(duration, integer) -> integer for (auto u : TimeUnit::values()) { for (auto numeric : NumericTypes()) { if (!is_integer(numeric->id())) continue; @@ -1718,6 +1720,24 @@ TEST_F(ScalarTemporalTest, TestTemporalDivideDuration) { CallFunction("divide_checked", {durations, zeros})); } } + + // div(duration, duration) -> float64 + auto left = ArrayFromJSON(duration(TimeUnit::SECOND), "[1, 2, 3, 4]"); + auto right = ArrayFromJSON(duration(TimeUnit::MILLI), "[4000, 300, 20, 1]"); + auto expected_left_by_right = + ArrayFromJSON(float64(), "[0.25, 6.666666666666667, 150, 4000]"); + auto expected_right_by_left = + ArrayFromJSON(float64(), "[4, 0.15, 0.006666666666666667, 0.00025]"); + CheckScalarBinary("divide", left, right, expected_left_by_right); + CheckScalarBinary("divide_checked", left, right, expected_left_by_right); + CheckScalarBinary("divide", right, left, expected_right_by_left); + CheckScalarBinary("divide_checked", right, left, expected_right_by_left); + + // Check dispatching + CheckDispatchBest("divide", {duration(TimeUnit::SECOND), duration(TimeUnit::MILLI)}, + {duration(TimeUnit::MILLI), duration(TimeUnit::MILLI)}); + CheckDispatchBest("divide", {duration(TimeUnit::NANO), duration(TimeUnit::MILLI)}, + {duration(TimeUnit::NANO), duration(TimeUnit::NANO)}); } TEST_F(ScalarTemporalTest, TestTemporalDifferenceWeeks) { diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 2eab7ae8afaf2..a7bb2d88c291b 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -27,6 +27,7 @@ #include "arrow/array/dict_internal.h" #include "arrow/array/util.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include 
"arrow/util/hashing.h" @@ -762,6 +763,38 @@ const FunctionDoc dictionary_encode_doc( ("Return a dictionary-encoded version of the input array."), {"array"}, "DictionaryEncodeOptions"); +// ---------------------------------------------------------------------- +// This function does not use any hashing utilities +// but is kept in this file to be near dictionary_encode +// Dictionary decode implementation + +const FunctionDoc dictionary_decode_doc{ + "Decodes a DictionaryArray to an Array", + ("Return a plain-encoded version of the array input\n" + "This function does nothing if the input is not a dictionary."), + {"dictionary_array"}}; + +class DictionaryDecodeMetaFunction : public MetaFunction { + public: + DictionaryDecodeMetaFunction() + : MetaFunction("dictionary_decode", Arity::Unary(), dictionary_decode_doc) {} + + Result ExecuteImpl(const std::vector& args, + const FunctionOptions* options, + ExecContext* ctx) const override { + if (args[0].type() == nullptr || args[0].type()->id() != Type::DICTIONARY) { + return args[0]; + } + + if (args[0].is_array() || args[0].is_chunked_array()) { + DictionaryType* dict_type = checked_cast(args[0].type().get()); + CastOptions cast_options = CastOptions::Safe(dict_type->value_type()); + return CallFunction("cast", args, &cast_options, ctx); + } else { + return Status::TypeError("Expected an Array or a Chunked Array"); + } + } +}; } // namespace void RegisterVectorHash(FunctionRegistry* registry) { @@ -819,6 +852,10 @@ void RegisterVectorHash(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(dict_encode))); } +void RegisterDictionaryDecode(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunction(std::make_shared())); +} + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index 277f8169bd261..eef1b6835ffb5 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -51,10 +51,12 @@ TEST(TestVectorNested, ListFlattenNulls) { TEST(TestVectorNested, ListFlattenChunkedArray) { for (auto ty : {list(int16()), large_list(int16())}) { + ARROW_SCOPED_TRACE(ty->ToString()); auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); + ARROW_SCOPED_TRACE("empty"); input = ChunkedArrayFromJSON(ty, {}); expected = ChunkedArrayFromJSON(int16(), {}); CheckVectorUnary("list_flatten", input, expected); diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc index eef816a149c93..943fdcd6b147f 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc @@ -179,7 +179,9 @@ class RunEndEncodeImpl { ARROW_ASSIGN_OR_RAISE( auto output_array_data, ree_util::PreallocateREEArray(std::move(ree_type), has_validity_buffer, - input_length, 0, 0, ctx_->memory_pool(), 0)); + /*logical_length=*/input_length, + /*physical_length=*/0, ctx_->memory_pool(), + /*data_buffer_size=*/0)); output_->value = std::move(output_array_data); return Status::OK(); } @@ -196,17 +198,22 @@ class RunEndEncodeImpl { /*output_run_ends=*/NULLPTR); std::tie(num_valid_runs, num_output_runs, data_buffer_size) = counting_loop.CountNumberOfRuns(); + const auto physical_null_count = num_output_runs - 
num_valid_runs; + DCHECK(!has_validity_buffer || physical_null_count > 0) + << "has_validity_buffer is expected to imply physical_null_count > 0"; ARROW_ASSIGN_OR_RAISE( auto output_array_data, ree_util::PreallocateREEArray( - std::move(ree_type), has_validity_buffer, input_length, num_output_runs, - num_output_runs - num_valid_runs, ctx_->memory_pool(), data_buffer_size)); + std::move(ree_type), has_validity_buffer, /*logical_length=*/input_length, + /*physical_length=*/num_output_runs, ctx_->memory_pool(), data_buffer_size)); // Initialize the output pointers auto* output_run_ends = output_array_data->child_data[0]->template GetMutableValues<RunEndCType>(1, 0); auto* output_values_array_data = output_array_data->child_data[1].get(); + // Set the null_count on the physical array + output_values_array_data->null_count = physical_null_count; // Second pass: write the runs RunEndEncodingLoop writing_loop( @@ -254,7 +261,7 @@ struct RunEndEncodeExec { return RunEndEncodeNullArray(TypeTraits<RunEndType>::type_singleton(), ctx, input_array, result); } else { - const bool has_validity_buffer = input_array.MayHaveNulls(); + const bool has_validity_buffer = input_array.GetNullCount() > 0; if (has_validity_buffer) { return RunEndEncodeImpl<RunEndType, ValueType, true>(ctx, input_array, result) .Exec(); @@ -398,10 +405,10 @@ class RunEndDecodeImpl { } } - ARROW_ASSIGN_OR_RAISE(auto output_array_data, - ree_util::PreallocateValuesArray( - ree_type->value_type(), has_validity_buffer, length, - kUnknownNullCount, ctx_->memory_pool(), data_buffer_size)); + ARROW_ASSIGN_OR_RAISE( + auto output_array_data, + ree_util::PreallocateValuesArray(ree_type->value_type(), has_validity_buffer, + length, ctx_->memory_pool(), data_buffer_size)); int64_t output_null_count = 0; if (length > 0) { @@ -435,7 +442,7 @@ struct RunEndDecodeExec { return RunEndDecodeNullREEArray(ctx, input_array, result); } else { const bool has_validity_buffer = - arrow::ree_util::ValuesArray(input_array).MayHaveNulls(); + arrow::ree_util::ValuesArray(input_array).GetNullCount() > 0; if (has_validity_buffer) { return RunEndDecodeImpl<RunEndType, ValueType, true>(ctx, input_array, result) .Exec(); diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc index f718d82774dcd..0bd8e3386e7cc 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc @@ -72,11 +72,19 @@ struct REETestData { std::vector<std::string> inputs_json, std::vector<std::string> expected_values_json, std::vector<std::string> expected_run_ends_json, - int64_t input_offset = 0) { + int64_t input_offset = 0, + bool force_validity_bitmap = false) { std::vector<std::shared_ptr<Array>> inputs; inputs.reserve(inputs_json.size()); for (const auto& input_json : inputs_json) { - inputs.push_back(ArrayFromJSON(data_type, input_json)); + auto chunk = ArrayFromJSON(data_type, input_json); + auto& data = chunk->data(); + if (force_validity_bitmap && !data->HasValidityBitmap()) { + EXPECT_OK_AND_ASSIGN(auto validity, AllocateBitmap(data->length)); + memset(validity->mutable_data(), 0xFF, validity->size()); + data->buffers[0] = std::move(validity); + } + inputs.push_back(std::move(chunk)); } auto chunked_input = std::make_shared<ChunkedArray>(std::move(inputs)); @@ -165,47 +173,52 @@ class TestRunEndEncodeDecode : public ::testing::TestWithParam< DCHECK(datum.is_chunked_array()); return datum.chunked_array(); } -}; - -TEST_P(TestRunEndEncodeDecode, EncodeDecodeArray) { - auto [data, run_end_type] = GetParam(); - ASSERT_OK_AND_ASSIGN( - Datum encoded_datum, - RunEndEncode(data.InputDatum(), 
RunEndEncodeOptions{run_end_type})); - - auto encoded = AsChunkedArray(encoded_datum); - ASSERT_OK(encoded->ValidateFull()); - ASSERT_EQ(data.input->length(), encoded->length()); + void TestEncodeDecodeArray(REETestData& data, + const std::shared_ptr& run_end_type) { + ASSERT_OK_AND_ASSIGN( + Datum encoded_datum, + RunEndEncode(data.InputDatum(), RunEndEncodeOptions{run_end_type})); + + auto encoded = AsChunkedArray(encoded_datum); + ASSERT_OK(encoded->ValidateFull()); + ASSERT_EQ(data.input->length(), encoded->length()); + + for (int i = 0; i < encoded->num_chunks(); i++) { + auto& chunk = encoded->chunk(i); + auto run_ends_array = MakeArray(chunk->data()->child_data[0]); + auto values_array = MakeArray(chunk->data()->child_data[1]); + ASSERT_OK(chunk->ValidateFull()); + ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(run_end_type, data.expected_run_ends_json[i]), + *run_ends_array); + ASSERT_ARRAYS_EQUAL(*values_array, *data.expected_values[i]); + ASSERT_EQ(chunk->data()->buffers.size(), 1); + ASSERT_EQ(chunk->data()->buffers[0], NULLPTR); + ASSERT_EQ(chunk->data()->child_data.size(), 2); + ASSERT_EQ(run_ends_array->data()->buffers[0], NULLPTR); + ASSERT_EQ(run_ends_array->length(), data.expected_values[i]->length()); + ASSERT_EQ(run_ends_array->offset(), 0); + ASSERT_EQ(chunk->data()->length, data.input->chunk(i)->length()); + ASSERT_EQ(chunk->data()->offset, 0); + ASSERT_EQ(*chunk->data()->type, + RunEndEncodedType(run_end_type, data.input->type())); + ASSERT_EQ(chunk->data()->null_count, 0); + } - for (int i = 0; i < encoded->num_chunks(); i++) { - auto& chunk = encoded->chunk(i); - auto run_ends_array = MakeArray(chunk->data()->child_data[0]); - auto values_array = MakeArray(chunk->data()->child_data[1]); - ASSERT_OK(chunk->ValidateFull()); - ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(run_end_type, data.expected_run_ends_json[i]), - *run_ends_array); - ASSERT_ARRAYS_EQUAL(*values_array, *data.expected_values[i]); - ASSERT_EQ(chunk->data()->buffers.size(), 1); - ASSERT_EQ(chunk->data()->buffers[0], NULLPTR); - ASSERT_EQ(chunk->data()->child_data.size(), 2); - ASSERT_EQ(run_ends_array->data()->buffers[0], NULLPTR); - ASSERT_EQ(run_ends_array->length(), data.expected_values[i]->length()); - ASSERT_EQ(run_ends_array->offset(), 0); - ASSERT_EQ(chunk->data()->length, data.input->chunk(i)->length()); - ASSERT_EQ(chunk->data()->offset, 0); - ASSERT_EQ(*chunk->data()->type, RunEndEncodedType(run_end_type, data.input->type())); - ASSERT_EQ(chunk->data()->null_count, 0); + ASSERT_OK_AND_ASSIGN(Datum decoded_datum, data.chunked + ? RunEndDecode(encoded) + : RunEndDecode(encoded->chunk(0))); + auto decoded = AsChunkedArray(decoded_datum); + ASSERT_OK(decoded->ValidateFull()); + for (int i = 0; i < decoded->num_chunks(); i++) { + ASSERT_ARRAYS_EQUAL(*decoded->chunk(i), *data.input->chunk(i)); + } } +}; - ASSERT_OK_AND_ASSIGN(Datum decoded_datum, data.chunked - ? 
RunEndDecode(encoded) - : RunEndDecode(encoded->chunk(0))); - auto decoded = AsChunkedArray(decoded_datum); - ASSERT_OK(decoded->ValidateFull()); - for (int i = 0; i < decoded->num_chunks(); i++) { - ASSERT_ARRAYS_EQUAL(*decoded->chunk(i), *data.input->chunk(i)); - } +TEST_P(TestRunEndEncodeDecode, EncodeDecodeArray) { + auto [data, run_end_type] = GetParam(); + TestEncodeDecodeArray(data, run_end_type); } // Encoding an input with an offset results in a completely new encoded array without an @@ -254,6 +267,17 @@ TEST_P(TestRunEndEncodeDecode, DecodeWithOffset) { } } +// GH-36708 +TEST_P(TestRunEndEncodeDecode, InputWithValidityAndNoNulls) { + auto data = + REETestData::JSONChunked(int32(), + /*inputs=*/{"[1, 1, 2, 2, 2, 3]", "[4, 5, 5, 5, 6, 6]"}, + /*expected_values=*/{"[1, 2, 3]", "[4, 5, 6]"}, + /*expected_run_ends=*/{"[2, 5, 6]", "[1, 4, 6]"}, + /*input_offset=*/0, /*force_validity_bitmap=*/true); + TestEncodeDecodeArray(data, int32()); +} + // This test creates an run-end encoded array with an offset in the child array, which // removes the first run in the test data. It's no-op for chunked input. TEST_P(TestRunEndEncodeDecode, DecodeWithOffsetInChildArray) { diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc index 3fcfbf3d8312d..f5867b405ec71 100644 --- a/cpp/src/arrow/compute/key_hash.cc +++ b/cpp/src/arrow/compute/key_hash.cc @@ -236,7 +236,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t const uint32_t* offsets, const uint8_t* concatenated_keys, uint32_t* hashes, uint32_t* hashes_temp_for_combine) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, concatenated_keys, hashes, hashes_temp_for_combine); @@ -255,7 +255,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t const uint64_t* offsets, const uint8_t* concatenated_keys, uint32_t* hashes, uint32_t* hashes_temp_for_combine) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, concatenated_keys, hashes, hashes_temp_for_combine); @@ -361,7 +361,7 @@ void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t } uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys, hashes, hashes_temp_for_combine); diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h index e43d7b8df523d..b193716c9bdfd 100644 --- a/cpp/src/arrow/compute/key_hash.h +++ b/cpp/src/arrow/compute/key_hash.h @@ -17,7 +17,7 @@ #pragma once -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) #include #endif @@ -115,7 +115,7 @@ class ARROW_EXPORT Hashing32 { static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_key, const uint8_t* keys, uint32_t* hashes); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static inline __m256i Avalanche_avx2(__m256i hash); static inline __m256i CombineHashesImp_avx2(__m256i previous_hash, __m256i hash); template diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc index f30c3460bda60..1b444b576784f 
100644 --- a/cpp/src/arrow/compute/key_hash_avx2.cc +++ b/cpp/src/arrow/compute/key_hash_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - inline __m256i Hashing32::Avalanche_avx2(__m256i hash) { hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15)); hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2)); @@ -315,7 +313,5 @@ uint32_t Hashing32::HashVarLen_avx2(bool combine_hashes, uint32_t num_rows, } } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index d10645391b413..3e6d41525cf44 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -21,18 +21,26 @@ #include #include #include + #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" namespace arrow { using internal::checked_pointer_cast; +using internal::CpuInfo; namespace compute { +std::vector HardwareFlagsForTesting() { + // Our key-hash and key-map routines currently only have AVX2 optimizations + return GetSupportedHardwareFlags({CpuInfo::AVX2}); +} + class TestVectorHash { private: template ::ArrayType> @@ -131,85 +139,79 @@ class TestVectorHash { const offset_t* key_offsets = reinterpret_cast(keys_array->raw_value_offsets()); - std::vector hashes_scalar32; - std::vector hashes_scalar64; - hashes_scalar32.resize(num_rows); - hashes_scalar64.resize(num_rows); - std::vector hashes_simd32; - std::vector hashes_simd64; - hashes_simd32.resize(num_rows); - hashes_simd64.resize(num_rows); - - int64_t hardware_flags_scalar = 0LL; - int64_t hardware_flags_simd = ::arrow::internal::CpuInfo::AVX2; + // For each tested hardware flags, we will compute the hashes and check + // them for consistency. + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + std::vector> hashes32(hardware_flags_for_testing.size()); + std::vector> hashes64(hardware_flags_for_testing.size()); + for (auto& h : hashes32) { + h.resize(num_rows); + } + for (auto& h : hashes64) { + h.resize(num_rows); + } constexpr int mini_batch_size = 1024; std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (bool use_simd : {false, true}) { + for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + const auto hardware_flags = hardware_flags_for_testing[i]; if (use_32bit_hash) { if (!use_varlen_input) { - Hashing32::HashFixed(use_simd ? hardware_flags_simd : hardware_flags_scalar, + Hashing32::HashFixed(hardware_flags, /*combine_hashes=*/false, num_rows, fixed_length, keys, - use_simd ? hashes_simd32.data() : hashes_scalar32.data(), - temp_buffer.data()); + hashes32[i].data(), temp_buffer.data()); } else { for (int first_row = 0; first_row < num_rows;) { int batch_size_next = std::min(num_rows - first_row, mini_batch_size); - Hashing32::HashVarLen( - use_simd ? hardware_flags_simd : hardware_flags_scalar, - /*combine_hashes=*/false, batch_size_next, key_offsets + first_row, keys, - (use_simd ? 
hashes_simd32.data() : hashes_scalar32.data()) + first_row, - temp_buffer.data()); + Hashing32::HashVarLen(hardware_flags, + /*combine_hashes=*/false, batch_size_next, + key_offsets + first_row, keys, + hashes32[i].data() + first_row, temp_buffer.data()); first_row += batch_size_next; } } + for (int j = 0; j < num_rows; ++j) { + hashes64[i][j] = hashes32[i][j]; + } } else { if (!use_varlen_input) { Hashing64::HashFixed( - /*combine_hashes=*/false, num_rows, fixed_length, keys, - use_simd ? hashes_simd64.data() : hashes_scalar64.data()); + /*combine_hashes=*/false, num_rows, fixed_length, keys, hashes64[i].data()); } else { Hashing64::HashVarLen( - /*combine_hashes=*/false, num_rows, key_offsets, keys, - use_simd ? hashes_simd64.data() : hashes_scalar64.data()); + /*combine_hashes=*/false, num_rows, key_offsets, keys, hashes64[i].data()); } } } - if (use_32bit_hash) { - for (int i = 0; i < num_rows; ++i) { - hashes_scalar64[i] = hashes_scalar32[i]; - hashes_simd64[i] = hashes_simd32[i]; - } - } - - // Verify that both scalar and AVX2 implementations give the same hashes + // Verify that all implementations (scalar, SIMD) give the same hashes // - for (int i = 0; i < num_rows; ++i) { - ASSERT_EQ(hashes_scalar64[i], hashes_simd64[i]) - << "scalar and simd approaches yielded different hashes"; + const auto& hashes_scalar64 = hashes64[0]; + for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (int j = 0; j < num_rows; ++j) { + ASSERT_EQ(hashes64[i][j], hashes_scalar64[j]) + << "scalar and simd approaches yielded different hashes"; + } } // Verify that the same key appearing multiple times generates the same hash // each time. Measure the number of unique hashes and compare to the number // of unique keys. // - std::map unique_key_to_hash; - std::set unique_hashes; + std::unordered_map unique_key_to_hash; + std::unordered_set unique_hashes; for (int i = 0; i < num_rows; ++i) { - std::map::iterator iter = unique_key_to_hash.find(row_ids[i]); - if (iter == unique_key_to_hash.end()) { - unique_key_to_hash.insert(std::make_pair(row_ids[i], hashes_scalar64[i])); - } else { - ASSERT_EQ(iter->second, hashes_scalar64[i]); - } - if (unique_hashes.find(hashes_scalar64[i]) == unique_hashes.end()) { - unique_hashes.insert(hashes_scalar64[i]); + auto [it, inserted] = + unique_key_to_hash.try_emplace(row_ids[i], hashes_scalar64[i]); + if (!inserted) { + ASSERT_EQ(it->second, hashes_scalar64[i]); } + unique_hashes.insert(hashes_scalar64[i]); } float percent_hash_collisions = 100.0f * static_cast(num_unique - unique_hashes.size()) / diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index fd5c404a07f8d..71ca56c91a9ff 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -133,7 +133,7 @@ void SwissTable::extract_group_ids(const int num_keys, const uint16_t* optional_ // Optimistically use simplified lookup involving only a start block to find // a single group id candidate for every input. -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) int num_group_id_bytes = num_group_id_bits / 8; if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && !optional_selection) { num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots, out_group_ids, @@ -301,7 +301,7 @@ void SwissTable::early_filter(const int num_keys, const uint32_t* hashes, // Optimistically use simplified lookup involving only a start block to find // a single group id candidate for every input. 
int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) { if (log_blocks_ <= 4) { num_processed = early_filter_imp_avx2_x32(num_keys, hashes, out_match_bitvector, diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index 7ab48470f21e4..95fb3be274288 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -163,7 +163,7 @@ class ARROW_EXPORT SwissTable { // void early_filter_imp(const int num_keys, const uint32_t* hashes, uint8_t* out_match_bitvector, uint8_t* out_local_slots) const; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes, uint8_t* out_match_bitvector, uint8_t* out_local_slots) const; diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc index eb318ff188fbb..731553511044f 100644 --- a/cpp/src/arrow/compute/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - // This is more or less translation of equivalent scalar code, adjusted for a // different instruction set (e.g. missing leading zero count instruction). // @@ -412,7 +410,5 @@ int SwissTable::extract_group_ids_avx2(const int num_keys, const uint32_t* hashe return num_keys - (num_keys % unroll); } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index a4b484a2069ea..7a54f78a03736 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -275,6 +275,7 @@ static std::unique_ptr CreateBuiltInRegistry() { // Register core kernels RegisterScalarCast(registry.get()); + RegisterDictionaryDecode(registry.get()); RegisterVectorHash(registry.get()); RegisterVectorSelection(registry.get()); diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index b4239701d9573..cdc9f804e72f1 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -28,6 +28,7 @@ namespace internal { void RegisterScalarArithmetic(FunctionRegistry* registry); void RegisterScalarBoolean(FunctionRegistry* registry); void RegisterScalarCast(FunctionRegistry* registry); +void RegisterDictionaryDecode(FunctionRegistry* registry); void RegisterScalarComparison(FunctionRegistry* registry); void RegisterScalarIfElse(FunctionRegistry* registry); void RegisterScalarNested(FunctionRegistry* registry); diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 39ac33932b548..7c402e7a2384d 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -42,7 +42,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com return; } uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, @@ -130,7 +130,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, const RowTableImpl& rows, uint8_t* match_bytevector) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = CompareBinaryColumnToRow_avx2( 
use_selection, offset_within_row, num_rows_to_compare, sel_left_maybe_null, @@ -297,7 +297,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t id_varbinary_col, const RowTableImpl& rows, uint8_t* match_bytevector) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = CompareVarBinaryColumnToRow_avx2( use_selection, is_first_varbinary_col, id_varbinary_col, num_rows_to_compare, @@ -313,7 +313,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t id_varbinary_col, void KeyCompare::AndByteVectors(LightContext* ctx, uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = AndByteVectors_avx2(num_elements, bytevector_A, bytevector_B); } diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 638b8c2ec721f..db953fbe11271 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -86,7 +86,7 @@ class ARROW_EXPORT KeyCompare { static void AndByteVectors(LightContext* ctx, uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) template static uint32_t NullUpdateColumnToRowImp_avx2( diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 95f37ab617db5..ff407c51b83cb 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -24,8 +24,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - inline __m256i set_first_n_bytes_avx2(int n) { constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; @@ -670,7 +668,5 @@ uint32_t KeyCompare::CompareVarBinaryColumnToRow_avx2( return num_rows_to_compare; } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 3a6a85b0272f8..01d552ef8270f 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -455,7 +455,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, bool is_row_fixed_length = rows.metadata().is_fixed_length; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows, col); @@ -466,7 +466,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, } else { DecodeImp(start_row, num_rows, offset_within_row, rows, col); } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif @@ -524,7 +524,7 @@ void EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows, bool is_row_fixed_length = rows.metadata().is_fixed_length; uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2() && col_width1 == col_width2) { num_processed = DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows, @@ -772,7 +772,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, KeyColumnArray* col, LightContext* ctx) { // Output column varbinary buffer needs an extra 32B // at the end in avx2 version and 8B 
otherwise. -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col); } else { @@ -782,7 +782,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, } else { DecodeImp(start_row, num_rows, varbinary_col_id, rows, col); } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif } diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index b83767b694cfd..6091fb66982af 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -187,7 +187,7 @@ class EncoderBinary { template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col); @@ -213,7 +213,7 @@ class EncoderBinaryPair { static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col1, KeyColumnArray* col2); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, @@ -300,7 +300,7 @@ class EncoderVarBinary { template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const RowTableImpl& rows, KeyColumnArray* col); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const RowTableImpl& rows, KeyColumnArray* col); diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index 02ba310bded20..50969c7bd6034 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -22,8 +22,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - void EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col) { @@ -230,7 +228,5 @@ void EncoderVarBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, }); } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index 78f90ea37f7af..faf3e0c87e4d2 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -56,7 +56,9 @@ void TempVectorStack::release(int id, uint32_t num_bytes) { --num_vectors_; } -inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { +namespace bit_util { + +inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { // This will not be correct on big-endian architectures. 
#if !ARROW_LITTLE_ENDIAN ARROW_DCHECK(false); @@ -73,7 +75,7 @@ inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes } } -inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) { +inline void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) { // This will not be correct on big-endian architectures. #if !ARROW_LITTLE_ENDIAN ARROW_DCHECK(false); @@ -88,8 +90,8 @@ inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_ } } -inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index, - int* num_indexes, uint16_t* indexes) { +inline void bits_to_indexes_helper(uint64_t word, uint16_t base_index, int* num_indexes, + uint16_t* indexes) { int n = *num_indexes; while (word) { indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); @@ -98,9 +100,8 @@ inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index, *num_indexes = n; } -inline void bit_util::bits_filter_indexes_helper(uint64_t word, - const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes) { +inline void bits_filter_indexes_helper(uint64_t word, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes) { int n = *num_indexes; while (word) { indexes[n++] = input_indexes[CountTrailingZeros(word)]; @@ -110,21 +111,21 @@ inline void bit_util::bits_filter_indexes_helper(uint64_t word, } template -void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, uint16_t base_index) { +void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes, + uint16_t base_index = 0) { // 64 bits at a time constexpr int unroll = 64; int tail = num_bits % unroll; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { if (filter_input_indexes) { - bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, - num_indexes, indexes); + avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, + num_indexes, indexes); } else { - bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes, - base_index); + avx2::bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, + indexes, base_index); } } else { #endif @@ -140,7 +141,7 @@ void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bi bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); } } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif // Optionally process the last partial word with masking out bits outside range @@ -160,9 +161,9 @@ void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bi } } -void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, - const uint8_t* bits, int* num_indexes, uint16_t* indexes, - int bit_offset) { +void bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, + const uint8_t* bits, int* num_indexes, uint16_t* indexes, + int bit_offset) { bits += bit_offset / 8; bit_offset %= 8; *num_indexes = 0; @@ -193,10 +194,9 @@ void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int nu *num_indexes += num_indexes_new; } -void bit_util::bits_filter_indexes(int bit_to_search, int64_t 
hardware_flags, - const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, int bit_offset) { +void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes, int bit_offset) { bits += bit_offset / 8; bit_offset %= 8; if (bit_offset != 0) { @@ -226,10 +226,9 @@ void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, } } -void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, int* num_indexes_bit0, - uint16_t* indexes_bit0, uint16_t* indexes_bit1, - int bit_offset) { +void bits_split_indexes(int64_t hardware_flags, const int num_bits, const uint8_t* bits, + int* num_indexes_bit0, uint16_t* indexes_bit0, + uint16_t* indexes_bit1, int bit_offset) { bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0, bit_offset); int num_indexes_bit1; @@ -237,8 +236,8 @@ void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits, bit_offset); } -void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, uint8_t* bytes, int bit_offset) { +void bits_to_bytes(int64_t hardware_flags, const int num_bits, const uint8_t* bits, + uint8_t* bytes, int bit_offset) { bits += bit_offset / 8; bit_offset %= 8; if (bit_offset != 0) { @@ -254,11 +253,11 @@ void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, } int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { // The function call below processes whole 32 bit chunks together. num_processed = num_bits - (num_bits % 32); - bits_to_bytes_avx2(num_processed, bits, bytes); + avx2::bits_to_bytes_avx2(num_processed, bits, bytes); } #endif // Processing 8 bits at a time @@ -290,8 +289,8 @@ void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, } } -void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, - const uint8_t* bytes, uint8_t* bits, int bit_offset) { +void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* bytes, + uint8_t* bits, int bit_offset) { bits += bit_offset / 8; bit_offset %= 8; if (bit_offset != 0) { @@ -310,11 +309,11 @@ void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, } int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { // The function call below processes whole 32 bit chunks together. 
num_processed = num_bits - (num_bits % 32); - bytes_to_bits_avx2(num_processed, bytes, bits); + avx2::bytes_to_bits_avx2(num_processed, bytes, bits); } #endif // Process 8 bits at a time @@ -338,11 +337,11 @@ void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, } } -bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, - uint32_t num_bytes) { -#if defined(ARROW_HAVE_AVX2) +bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes) { +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { - return are_all_bytes_zero_avx2(bytes, num_bytes); + return avx2::are_all_bytes_zero_avx2(bytes, num_bytes); } #endif uint64_t result_or = 0; @@ -358,6 +357,7 @@ bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, return result_or == 0; } +} // namespace bit_util } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 6e1bb79674cba..730e59f346a52 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -139,69 +139,55 @@ class TempVectorHolder { uint32_t num_elements_; }; -class ARROW_EXPORT bit_util { - public: - static void bits_to_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, int* num_indexes, - uint16_t* indexes, int bit_offset = 0); +namespace bit_util { - static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, +ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, int bit_offset = 0); + int* num_indexes, uint16_t* indexes, + int bit_offset = 0); - // Input and output indexes may be pointing to the same data (in-place filtering). - static void bits_split_indexes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, int* num_indexes_bit0, - uint16_t* indexes_bit0, uint16_t* indexes_bit1, - int bit_offset = 0); +ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, int bit_offset = 0); - // Bit 1 is replaced with byte 0xFF. - static void bits_to_bytes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); +// Input and output indexes may be pointing to the same data (in-place filtering). +ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, uint16_t* indexes_bit1, + int bit_offset = 0); - // Return highest bit of each byte. - static void bytes_to_bits(int64_t hardware_flags, const int num_bits, - const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); +// Bit 1 is replaced with byte 0xFF. +ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); - static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, - uint32_t num_bytes); +// Return highest bit of each byte. 
+ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); - private: - inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes); - inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value); - inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index, - int* num_indexes, uint16_t* indexes); - inline static void bits_filter_indexes_helper(uint64_t word, - const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); - template - static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes, - uint16_t base_index = 0); - -#if defined(ARROW_HAVE_AVX2) - static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, int* num_indexes, - uint16_t* indexes, uint16_t base_index = 0); - static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); - template - static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - int* num_indexes, uint16_t* indexes, - uint16_t base_index = 0); - template - static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, +ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes); + +#if defined(ARROW_HAVE_RUNTIME_AVX2) + +namespace avx2 { +ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, int* num_indexes, uint16_t* indexes); - static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes); - static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits); - static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, int* num_indexes, + uint16_t* indexes, uint16_t base_index = 0); +ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, + uint8_t* bytes); +ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, + uint8_t* bits); +ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +} // namespace avx2 + #endif -}; +} // namespace bit_util } // namespace util namespace compute { diff --git a/cpp/src/arrow/compute/util_avx2.cc b/cpp/src/arrow/compute/util_avx2.cc index 7c2a378254562..0191ab06f9532 100644 --- a/cpp/src/arrow/compute/util_avx2.cc +++ b/cpp/src/arrow/compute/util_avx2.cc @@ -16,30 +16,16 @@ // under the License. 
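
Two mechanical refactors run through the compute changes above: `ARROW_HAVE_AVX2` guards become `ARROW_HAVE_RUNTIME_AVX2` (the `*_avx2.cc` translation units are now compiled whenever runtime dispatch is available, so the `#if`/`#endif` pairs inside them could be deleted), and the old `bit_util` class of static methods becomes a plain namespace with the AVX2 entry points split into `bit_util::avx2`. A minimal sketch of the resulting dispatch shape, using hypothetical stand-in names rather than the real Arrow functions:

```cpp
#include <cstdint>

constexpr int64_t kAvx2Flag = 1;  // stands in for arrow::internal::CpuInfo::AVX2

namespace avx2 {
// Stand-in for a *_avx2 kernel; in Arrow these live in separate *_avx2.cc
// files built with AVX2 codegen and must only run when the CPU supports it.
inline int ProcessBulkAvx2(const uint8_t* /*data*/, int num_bulk) { return num_bulk; }
}  // namespace avx2

int Process(int64_t hardware_flags, const uint8_t* data, int num) {
  int num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
  // Compile-time gate: are the AVX2 symbols available in this build?
  // Runtime gate: does this CPU actually report AVX2?
  if (hardware_flags & kAvx2Flag) {
    num_processed = avx2::ProcessBulkAvx2(data, num - (num % 32));  // SIMD bulk
  }
#endif
  for (int i = num_processed; i < num; ++i) {
    // Scalar tail, and the full fallback when AVX2 is unavailable.
  }
  return num;
}
```

The `#if` now gates only symbol availability; the actual choice between scalar and SIMD paths happens per call via `hardware_flags`, which is also what lets the key-hash tests compare both paths on the same machine.
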
#include +#include -#include "arrow/acero/util.h" #include "arrow/util/bit_util.h" +#include "arrow/util/logging.h" -namespace arrow { -namespace util { - -#if defined(ARROW_HAVE_AVX2) - -void bit_util::bits_to_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, int* num_indexes, - uint16_t* indexes, uint16_t base_index) { - if (bit_to_search == 0) { - bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes, base_index); - } else { - ARROW_DCHECK(bit_to_search == 1); - bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes, base_index); - } -} +namespace arrow::util::bit_util::avx2 { template -void bit_util::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - int* num_indexes, uint16_t* indexes, - uint16_t base_index) { +void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int* num_indexes, + uint16_t* indexes, uint16_t base_index = 0) { // 64 bits at a time constexpr int unroll = 64; @@ -82,21 +68,20 @@ void bit_util::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, } } -void bit_util::bits_filter_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes) { +void bits_to_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits, + int* num_indexes, uint16_t* indexes, uint16_t base_index) { if (bit_to_search == 0) { - bits_filter_indexes_imp_avx2<0>(num_bits, bits, input_indexes, num_indexes, indexes); + bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes, base_index); } else { - bits_filter_indexes_imp_avx2<1>(num_bits, bits, input_indexes, num_indexes, indexes); + ARROW_DCHECK(bit_to_search == 1); + bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes, base_index); } } template -void bit_util::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, - int* out_num_indexes, uint16_t* indexes) { +void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* out_num_indexes, + uint16_t* indexes) { // 64 bits at a time constexpr int unroll = 64; @@ -167,8 +152,17 @@ void bit_util::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* b *out_num_indexes = num_indexes; } -void bit_util::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, - uint8_t* bytes) { +void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes) { + if (bit_to_search == 0) { + bits_filter_indexes_imp_avx2<0>(num_bits, bits, input_indexes, num_indexes, indexes); + } else { + bits_filter_indexes_imp_avx2<1>(num_bits, bits, input_indexes, num_indexes, indexes); + } +} + +void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes) { constexpr int unroll = 32; constexpr uint64_t kEachByteIs1 = 0x0101010101010101ULL; @@ -188,8 +182,7 @@ void bit_util::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, } } -void bit_util::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, - uint8_t* bits) { +void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits) { constexpr int unroll = 32; // Processing 32 bits at a time for (int i = 0; i < num_bits / unroll; ++i) { @@ -198,7 +191,7 @@ void bit_util::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, } } -bool bit_util::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) { 
+bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) { __m256i result_or = _mm256_setzero_si256(); uint32_t i; for (i = 0; i < num_bytes / 32; ++i) { @@ -216,7 +209,4 @@ bool bit_util::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) return result_or32 == 0; } -#endif // ARROW_HAVE_AVX2 - -} // namespace util -} // namespace arrow +} // namespace arrow::util::bit_util::avx2 diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index fdc7fcb1380e5..bf703b6c6ba28 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -1246,27 +1246,6 @@ Result> TableReader::Make( parse_options, convert_options); } -Result> TableReader::Make( - MemoryPool* pool, io::IOContext io_context, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options) { - return MakeTableReader(pool, io_context, std::move(input), read_options, parse_options, - convert_options); -} - -Result> StreamingReader::Make( - MemoryPool* pool, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options) { - auto io_context = io::IOContext(pool); - auto cpu_executor = arrow::internal::GetCpuThreadPool(); - auto reader_fut = MakeStreamingReader(io_context, std::move(input), cpu_executor, - read_options, parse_options, convert_options); - auto reader_result = reader_fut.result(); - ARROW_ASSIGN_OR_RAISE(auto reader, reader_result); - return reader; -} - Result> StreamingReader::Make( io::IOContext io_context, std::shared_ptr input, const ReadOptions& read_options, const ParseOptions& parse_options, diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h index 03b953d0055e5..bae301dc14815 100644 --- a/cpp/src/arrow/csv/reader.h +++ b/cpp/src/arrow/csv/reader.h @@ -52,13 +52,6 @@ class ARROW_EXPORT TableReader { const ReadOptions&, const ParseOptions&, const ConvertOptions&); - - ARROW_DEPRECATED( - "Deprecated in 4.0.0. " - "Use MemoryPool-less variant (the IOContext holds a pool already)") - static Result> Make( - MemoryPool* pool, io::IOContext io_context, std::shared_ptr input, - const ReadOptions&, const ParseOptions&, const ConvertOptions&); }; /// \brief A class that reads a CSV file incrementally @@ -105,12 +98,6 @@ class ARROW_EXPORT StreamingReader : public RecordBatchReader { static Result> Make( io::IOContext io_context, std::shared_ptr input, const ReadOptions&, const ParseOptions&, const ConvertOptions&); - - ARROW_DEPRECATED("Deprecated in 4.0.0. Use IOContext-based overload") - static Result> Make( - MemoryPool* pool, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options); }; /// \brief Count the logical rows of data in a CSV file (i.e. the diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 1db230b16e9c2..39936fbd7b5b2 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -82,7 +82,7 @@ class ARROW_DS_EXPORT FragmentSelection { /// \brief Instructions for scanning a particular fragment /// -/// The fragment scan request is dervied from ScanV2Options. The main +/// The fragment scan request is derived from ScanV2Options. The main /// difference is that the scan options are based on the dataset schema /// while the fragment request is based on the fragment schema. 
struct ARROW_DS_EXPORT FragmentScanRequest { diff --git a/cpp/src/arrow/device.h b/cpp/src/arrow/device.h index 67c62a5181f28..9cc68fe8c82ce 100644 --- a/cpp/src/arrow/device.h +++ b/cpp/src/arrow/device.h @@ -29,6 +29,24 @@ namespace arrow { +/// \brief EXPERIMENTAL: Device type enum which matches up with C Data Device types +enum class DeviceAllocationType : char { + kCPU = 1, + kCUDA = 2, + kCUDA_HOST = 3, + kOPENCL = 4, + kVULKAN = 7, + kMETAL = 8, + kVPI = 9, + kROCM = 10, + kROCM_HOST = 11, + kEXT_DEV = 12, + kCUDA_MANAGED = 13, + kONEAPI = 14, + kWEBGPU = 15, + kHEXAGON = 16, +}; + class MemoryManager; /// \brief EXPERIMENTAL: Abstract interface for hardware devices @@ -58,6 +76,12 @@ class ARROW_EXPORT Device : public std::enable_shared_from_this, /// \brief Whether this instance points to the same device as another one. virtual bool Equals(const Device&) const = 0; + /// \brief A device ID to identify this device if there are multiple of this type. + /// + /// If there is no "device_id" equivalent (such as for the main CPU device on + /// non-numa systems) returns -1. + virtual int64_t device_id() const { return -1; } + /// \brief Whether this device is the main CPU device. /// /// This shorthand method is very useful when deciding whether a memory address @@ -71,6 +95,9 @@ class ARROW_EXPORT Device : public std::enable_shared_from_this, /// MemoryManager instances with non-default parameters. virtual std::shared_ptr default_memory_manager() = 0; + /// \brief Return the DeviceAllocationType of this device + virtual DeviceAllocationType device_type() const = 0; + protected: ARROW_DISALLOW_COPY_AND_ASSIGN(Device); explicit Device(bool is_cpu = false) : is_cpu_(is_cpu) {} @@ -172,6 +199,7 @@ class ARROW_EXPORT CPUDevice : public Device { const char* type_name() const override; std::string ToString() const override; bool Equals(const Device&) const override; + DeviceAllocationType device_type() const override { return DeviceAllocationType::kCPU; } std::shared_ptr default_memory_manager() override; diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 97aa01ea9f995..b997ca0a387a6 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -47,6 +47,15 @@ if(ARROW_GCS) Boost::system) endif() +if(ARROW_AZURE) + add_arrow_test(azurefs_test + EXTRA_LABELS + filesystem + EXTRA_LINK_LIBS + Boost::filesystem + Boost::system) +endif() + if(ARROW_S3) add_arrow_test(s3fs_test SOURCES diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc new file mode 100644 index 0000000000000..0158c0cec74e1 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
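
A note on the `device.h` hunk above: `DeviceAllocationType` mirrors the device enum of the Arrow C Data Interface, and the new `device_type()` and `device_id()` accessors let callers branch on where memory lives without downcasting `Device`. A small usage sketch against only the interfaces shown in this diff:

```cpp
#include "arrow/device.h"

// Returns true if the device is any flavor of CUDA memory.
bool IsCudaDevice(const arrow::Device& device) {
  switch (device.device_type()) {
    case arrow::DeviceAllocationType::kCUDA:
    case arrow::DeviceAllocationType::kCUDA_HOST:
    case arrow::DeviceAllocationType::kCUDA_MANAGED:
      return true;
    default:
      // device.device_id() would be -1 here for the plain CPU device,
      // which has no per-device index.
      return false;
  }
}
```
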
+ +#include "arrow/filesystem/azurefs.h" + +#include "arrow/result.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace fs { + +// ----------------------------------------------------------------------- +// AzureOptions Implementation + +AzureOptions::AzureOptions() {} + +bool AzureOptions::Equals(const AzureOptions& other) const { + return (account_dfs_url == other.account_dfs_url && + account_blob_url == other.account_blob_url && + credentials_kind == other.credentials_kind); +} + +// ----------------------------------------------------------------------- +// AzureFilesystem Implementation + +class AzureFileSystem::Impl { + public: + io::IOContext io_context_; + bool is_hierarchical_namespace_enabled_; + AzureOptions options_; + + explicit Impl(AzureOptions options, io::IOContext io_context) + : io_context_(io_context), options_(std::move(options)) {} + + Status Init() { + if (options_.backend == AzureBackend::Azurite) { + // gen1Client_->GetAccountInfo().Value.IsHierarchicalNamespaceEnabled + // throws error in azurite + is_hierarchical_namespace_enabled_ = false; + } + return Status::OK(); + } + + const AzureOptions& options() const { return options_; } +}; + +const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } + +bool AzureFileSystem::Equals(const FileSystem& other) const { + if (this == &other) { + return true; + } + if (other.type_name() != type_name()) { + return false; + } + const auto& azure_fs = ::arrow::internal::checked_cast(other); + return options().Equals(azure_fs.options()); +} + +Result AzureFileSystem::GetFileInfo(const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result AzureFileSystem::GetFileInfo(const FileSelector& select) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteDir(const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteRootDirContents() { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteFile(const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputStream( + const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputStream( + const FileInfo& info) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputFile( + const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputFile( + const FileInfo& info) { + return 
Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr& metadata) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenAppendStream( + const std::string&, const std::shared_ptr&) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::Make( + const AzureOptions& options, const io::IOContext& io_context) { + std::shared_ptr ptr(new AzureFileSystem(options, io_context)); + RETURN_NOT_OK(ptr->impl_->Init()); + return ptr; +} + +AzureFileSystem::AzureFileSystem(const AzureOptions& options, + const io::IOContext& io_context) + : FileSystem(io_context), impl_(std::make_unique(options, io_context)) { + default_async_is_sync_ = false; +} + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h new file mode 100644 index 0000000000000..e5af4d23aabe5 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/macros.h" +#include "arrow/util/uri.h" + +namespace Azure { +namespace Core { +namespace Credentials { + +class TokenCredential; + +} // namespace Credentials +} // namespace Core +namespace Storage { + +class StorageSharedKeyCredential; + +} // namespace Storage +} // namespace Azure + +namespace arrow { +namespace fs { + +enum class AzureCredentialsKind : int8_t { + /// Anonymous access (no credentials used), public + Anonymous, + /// Use explicitly-provided access key pair + StorageCredentials, + /// Use ServicePrincipleCredentials + ServicePrincipleCredentials, + /// Use Sas Token to authenticate + Sas, + /// Use Connection String + ConnectionString +}; + +enum class AzureBackend : bool { + /// Official Azure Remote Backend + Azure, + /// Local Simulated Storage + Azurite +}; + +/// Options for the AzureFileSystem implementation. +struct ARROW_EXPORT AzureOptions { + std::string account_dfs_url; + std::string account_blob_url; + AzureBackend backend = AzureBackend::Azure; + AzureCredentialsKind credentials_kind = AzureCredentialsKind::Anonymous; + + std::string sas_token; + std::string connection_string; + std::shared_ptr + storage_credentials_provider; + std::shared_ptr + service_principle_credentials_provider; + + AzureOptions(); + + bool Equals(const AzureOptions& other) const; +}; + +/// \brief Azure-backed FileSystem implementation for ABFS and ADLS. 
+/// +/// ABFS (Azure Blob Storage - https://azure.microsoft.com/en-us/products/storage/blobs/) +/// is an object-based cloud storage system. +/// +/// ADLS (Azure Data Lake Storage - +/// https://azure.microsoft.com/en-us/products/storage/data-lake-storage/) +/// is a scalable data storage system designed for big-data applications. +/// ADLS provides filesystem semantics, file-level security, and Hadoop +/// compatibility. Gen1 exists as a separate object that will be retired +/// on Feb 29, 2024. New ADLS accounts will use Gen2 instead, which is +/// implemented on top of ABFS. +/// +/// TODO: GH-18014 Complete the internal implementation +/// and review the documentation class ARROW_EXPORT AzureFileSystem : public FileSystem { public: + ~AzureFileSystem() override = default; + + std::string type_name() const override { return "abfs"; } + + /// Return the original Azure options when constructing the filesystem + const AzureOptions& options() const; + + bool Equals(const FileSystem& other) const override; + + Result GetFileInfo(const std::string& path) override; + + Result GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override; + + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + + Result> OpenInputStream(const FileInfo& info) override; + + Result> OpenInputFile( + const std::string& path) override; + + Result> OpenInputFile( + const FileInfo& info) override; + + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata = {}) override; + + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata = {}) override; + + static Result> Make( + const AzureOptions& options, const io::IOContext& = io::default_io_context()); + + private: + explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); + + class Impl; + std::unique_ptr impl_; +}; + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/flight/try_compile/check_tls_opts_143.cc b/cpp/src/arrow/filesystem/azurefs_test.cc similarity index 54% rename from cpp/src/arrow/flight/try_compile/check_tls_opts_143.cc rename to cpp/src/arrow/filesystem/azurefs_test.cc index 2fdaac9d6ef7e..0f03e88393aeb 100644 --- a/cpp/src/arrow/flight/try_compile/check_tls_opts_143.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -15,23 +15,32 @@ // specific language governing permissions and limitations // under the License. -// Dummy file for checking if TlsCredentialsOptions supports -// set_verify_server_certs. gRPC starting from 1.43 added this boolean -// flag as opposed to prior versions which used an enum. This is for -// supporting disabling server validation when using TLS.
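A hedged usage sketch for the filesystem stub declared above; AzureOptions, Make() and the NotImplemented stubs are from this diff, and the wrapper function is hypothetical:

#include "arrow/filesystem/azurefs.h"
#include "arrow/result.h"

arrow::Status TryAzureStub() {
  arrow::fs::AzureOptions options;  // defaults: AzureBackend::Azure, anonymous credentials
  ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::AzureFileSystem::Make(options));
  // Every operation is still a stub at this stage (GH-18014), so this
  // returns Status::NotImplemented rather than a FileInfo.
  return fs->GetFileInfo("container/path/to/blob").status();
}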
- -#include -#include -#include - -static void check() { - // 1.36 uses an enum; 1.43 uses booleans - auto options = std::make_shared(); - options->set_check_call_host(false); - options->set_verify_server_certs(false); -} +#include "arrow/filesystem/azurefs.h" + +#include +#include +#include + +#include + +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" -int main(int argc, const char** argv) { - check(); - return 0; +namespace arrow { +namespace fs { +namespace { + +using ::testing::IsEmpty; +using ::testing::Not; +using ::testing::NotNull; + +// Placeholder test for file structure +// TODO: GH-18014 Remove once a proper test is added +TEST(AzureFileSystem, OptionsCompare) { + AzureOptions options; + EXPECT_TRUE(options.Equals(options)); } + +} // namespace +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/filesystem_test.cc b/cpp/src/arrow/filesystem/filesystem_test.cc index b450a69913135..c76c3d27e8f8e 100644 --- a/cpp/src/arrow/filesystem/filesystem_test.cc +++ b/cpp/src/arrow/filesystem/filesystem_test.cc @@ -87,6 +87,34 @@ TEST(PathUtil, SplitAbstractPath) { AssertPartsEqual(parts, {"abc", "def.ghi"}); } +TEST(PathUtil, SliceAbstractPath) { + std::string path = "abc"; + ASSERT_EQ("abc", SliceAbstractPath(path, 0, 1)); + ASSERT_EQ("abc", SliceAbstractPath(path, 0, 2)); + ASSERT_EQ("", SliceAbstractPath(path, 0, 0)); + ASSERT_EQ("", SliceAbstractPath(path, 1, 0)); + + path = "abc/def\\x/y.ext"; + ASSERT_EQ("abc/def\\x/y.ext", SliceAbstractPath(path, 0, 4)); + ASSERT_EQ("abc/def\\x/y.ext", SliceAbstractPath(path, 0, 3)); + ASSERT_EQ("abc/def\\x", SliceAbstractPath(path, 0, 2)); + ASSERT_EQ("abc", SliceAbstractPath(path, 0, 1)); + ASSERT_EQ("def\\x/y.ext", SliceAbstractPath(path, 1, 2)); + ASSERT_EQ("def\\x/y.ext", SliceAbstractPath(path, 1, 3)); + ASSERT_EQ("def\\x", SliceAbstractPath(path, 1, 1)); + ASSERT_EQ("y.ext", SliceAbstractPath(path, 2, 1)); + ASSERT_EQ("", SliceAbstractPath(path, 3, 1)); + + path = "x/y\\z"; + ASSERT_EQ("x", SliceAbstractPath(path, 0, 1)); + ASSERT_EQ("x/y", SliceAbstractPath(path, 0, 1, /*sep=*/'\\')); + + // Invalid cases but we shouldn't crash + ASSERT_EQ("", SliceAbstractPath(path, -1, 1)); + ASSERT_EQ("", SliceAbstractPath(path, 0, -1)); + ASSERT_EQ("", SliceAbstractPath(path, -1, -1)); +} + TEST(PathUtil, GetAbstractPathExtension) { ASSERT_EQ(GetAbstractPathExtension("abc.txt"), "txt"); ASSERT_EQ(GetAbstractPathExtension("dir/abc.txt"), "txt"); @@ -98,6 +126,19 @@ TEST(PathUtil, GetAbstractPathExtension) { ASSERT_EQ(GetAbstractPathExtension("/run.d/abc"), ""); } +TEST(PathUtil, GetAbstractPathDepth) { + ASSERT_EQ(0, GetAbstractPathDepth("")); + ASSERT_EQ(0, GetAbstractPathDepth("/")); + ASSERT_EQ(1, GetAbstractPathDepth("foo")); + ASSERT_EQ(1, GetAbstractPathDepth("foo/")); + ASSERT_EQ(1, GetAbstractPathDepth("/foo")); + ASSERT_EQ(1, GetAbstractPathDepth("/foo/")); + ASSERT_EQ(2, GetAbstractPathDepth("/foo/bar")); + ASSERT_EQ(2, GetAbstractPathDepth("/foo/bar/")); + ASSERT_EQ(2, GetAbstractPathDepth("foo/bar")); + ASSERT_EQ(2, GetAbstractPathDepth("foo/bar/")); +} + TEST(PathUtil, GetAbstractPathParent) { std::pair pair; diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index e25e544f0341f..90af3c66ff8d4 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -17,6 +17,7 @@ #include #include +#include #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" @@ -66,6 +67,42 @@ std::vector 
SplitAbstractPath(const std::string& path, char sep) { return parts; } +std::string SliceAbstractPath(const std::string& s, int offset, int length, char sep) { + if (offset < 0 || length < 0) { + return ""; + } + std::vector components = SplitAbstractPath(s, sep); + std::stringstream combined; + if (offset >= static_cast(components.size())) { + return ""; + } + int end = offset + length; + if (end > static_cast(components.size())) { + end = static_cast(components.size()); + } + for (int i = offset; i < end; i++) { + combined << components[i]; + if (i < end - 1) { + combined << sep; + } + } + return combined.str(); +} + +int GetAbstractPathDepth(std::string_view path) { + if (path.empty()) { + return 0; + } + int depth = static_cast(std::count(path.begin(), path.end(), kSep)) + 1; + if (path.back() == kSep) { + depth -= 1; + } + if (path.front() == kSep) { + depth -= 1; + } + return depth; +} + std::pair GetAbstractPathParent(const std::string& s) { // XXX should strip trailing slash? diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index b821e79338490..13a74b7fa12c8 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -38,9 +38,25 @@ constexpr char kSep = '/'; ARROW_EXPORT std::vector SplitAbstractPath(const std::string& path, char sep = kSep); -// Return the extension of the file +// Slice the individual components of an abstract path and combine them +// +// If offset or length are negative then an empty string is returned +// If offset is >= the number of components then an empty string is returned +// If offset + length is >= the number of components then length is truncated ARROW_EXPORT -std::string GetAbstractPathExtension(const std::string& s); +std::string SliceAbstractPath(const std::string& path, int offset, int length, + char sep = kSep); + +// Return the extension of the file +ARROW_EXPORT std::string GetAbstractPathExtension(const std::string& s); + +// Return the depth (number of components) of an abstract path +// +// Trailing slashes do not count towards depth +// Leading slashes do not count towards depth +// +// The root path ("/") has depth 0 +ARROW_EXPORT int GetAbstractPathDepth(std::string_view path); // Return the parent directory and basename of an abstract path. Both values may be // empty. diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index 17245e0a89e09..e270a6e1c469a 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -26,6 +26,7 @@ #include "arrow/filesystem/s3fs.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" @@ -76,6 +77,13 @@ class MinioTestEnvironment : public ::testing::Environment { class S3Environment : public ::testing::Environment { public: + // We set this environment variable to speed up tests by ensuring + // DefaultAWSCredentialsProviderChain does not query (inaccessible) + // EC2 metadata endpoint. + // This must be done before spawning any Minio child process to avoid any race + // condition accessing environment variables. 
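Worked examples for the two path helpers added above; the expected values follow from the implementations and unit tests in this diff, and the wrapper function is illustrative only:

#include <cassert>
#include <string>

#include "arrow/filesystem/path_util.h"

void PathHelperExamples() {
  using arrow::fs::internal::GetAbstractPathDepth;
  using arrow::fs::internal::SliceAbstractPath;

  assert(SliceAbstractPath("abc/def/ghi", 1, 2) == "def/ghi");  // components [1, 3)
  assert(SliceAbstractPath("abc", 5, 1) == "");    // offset past the last component
  assert(SliceAbstractPath("abc", 0, -1) == "");   // negative length is rejected
  assert(GetAbstractPathDepth("/foo/bar/") == 2);  // leading/trailing '/' don't count
  assert(GetAbstractPathDepth("/") == 0);          // the root has depth 0
}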
+ S3Environment() : ec2_metadata_disabled_guard_("AWS_EC2_METADATA_DISABLED", "true") {} + void SetUp() override { // Change this to increase logging during tests S3GlobalOptions options; @@ -84,6 +92,9 @@ class S3Environment : public ::testing::Environment { } void TearDown() override { ASSERT_OK(FinalizeS3()); } + + private: + EnvVarGuard ec2_metadata_disabled_guard_; }; } // namespace fs diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index f05d89963ce45..c67f7668ffa4d 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #ifdef _WIN32 @@ -59,6 +60,7 @@ #endif #include #include +#include #include #include #include @@ -91,6 +93,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/async_generator.h" +#include "arrow/util/async_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" @@ -127,6 +130,7 @@ using internal::ToAwsString; using internal::ToURLEncodedAwsString; static const char kSep = '/'; +constexpr char kAwsEndpointUrlEnvVar[] = "AWS_ENDPOINT_URL"; // ----------------------------------------------------------------------- // S3ProxyOptions implementation @@ -337,6 +341,10 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { } else { options.ConfigureDefaultCredentials(); } + auto endpoint_env = arrow::internal::GetEnvVar(kAwsEndpointUrlEnvVar); + if (endpoint_env.ok()) { + options.endpoint_override = *endpoint_env; + } bool region_set = false; for (const auto& kv : options_map) { @@ -712,6 +720,7 @@ void DisableRedirects(Aws::Client::ClientConfiguration* c) { // To prevent such issues, we wrap all S3Client instances in a special // structure (S3ClientHolder) that prevents usage of S3Client after // S3 was finalized. +// Please make sure you read the comments in S3ClientLock::Move below. // // See: GH-36346, GH-15054. @@ -722,6 +731,22 @@ class S3ClientLock { S3Client* get() { return client_.get(); } S3Client* operator->() { return client_.get(); } + // Move this S3ClientLock into a temporary instance + // + // It is counter-intuitive, but lock ordering issues can happen even + // with a shared mutex locked in shared mode. + // The reason is that locking again in shared mode can block while + // there are threads waiting to take the lock in exclusive mode. + // Therefore, we should avoid obtaining the S3ClientLock when + // we already have it locked. + // + // This method helps by moving the S3ClientLock into a temporary + // that is immediately destroyed so the lock will be released as + // soon as we are done making the call to the underlying client. + // + // (see GH-36523) + S3ClientLock Move() { return std::move(*this); } + protected: friend class S3ClientHolder; @@ -800,11 +825,28 @@ class S3ClientFinalizer : public std::enable_shared_from_this }; Result S3ClientHolder::Lock() { - std::lock_guard lock(mutex_); - auto finalizer = finalizer_.lock(); + std::shared_ptr finalizer; + std::shared_ptr client; + { + std::unique_lock lock(mutex_); + finalizer = finalizer_.lock(); + client = client_; + } + // Do not hold mutex while taking finalizer lock below.
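The Move() comment above prescribes a calling convention for S3ClientLock; a minimal sketch of that pattern (S3ClientHolder and S3Model are internal to s3fs.cc, and this particular function is hypothetical):

arrow::Status CheckObjectExists(S3ClientHolder* holder,
                                const S3Model::HeadObjectRequest& req) {
  ARROW_ASSIGN_OR_RAISE(auto client_lock, holder->Lock());
  // Move() yields a temporary that is destroyed at the end of the full
  // expression, so the shared finalization lock is released as soon as
  // HeadObject() returns rather than at the end of this scope, and we never
  // attempt holder->Lock() again while the lock is still held.
  auto outcome = client_lock.Move()->HeadObject(req);
  return outcome.IsSuccess() ? arrow::Status::OK()
                             : arrow::Status::IOError("HeadObject failed");
}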
+ // + // Acquiring a shared_mutex in shared mode may block even if not already + // acquired in exclusive mode, because of pending writers: + // https://github.com/google/sanitizers/issues/1668#issuecomment-1624985664 + // """It is implementation-defined whether the calling thread acquires + // the lock when a writer does not hold the lock and there are writers + // blocked on the lock""". + // + // Therefore, we want to avoid potential lock ordering issues + // even when a shared lock is involved (GH-36523). if (!finalizer) { return ErrorS3Finalized(); } + S3ClientLock client_lock; // Lock the finalizer before examining it client_lock.lock_ = finalizer->LockShared(); @@ -812,14 +854,18 @@ Result S3ClientHolder::Lock() { return ErrorS3Finalized(); } // (the client can be cleared only if finalizer->finalized_ is true) - DCHECK(client_) << "inconsistent S3ClientHolder"; - client_lock.client_ = client_; + DCHECK(client) << "inconsistent S3ClientHolder"; + client_lock.client_ = std::move(client); return client_lock; } void S3ClientHolder::Finalize() { - std::lock_guard lock(mutex_); - client_.reset(); + std::shared_ptr client; + { + std::unique_lock lock(mutex_); + client = std::move(client_); + } + // Do not hold mutex while ~S3Client potentially runs } std::shared_ptr GetClientFinalizer() { @@ -1158,7 +1204,7 @@ class ObjectInputFile final : public io::RandomAccessFile { req.SetKey(ToAwsString(path_.key)); ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); - auto outcome = client_lock->HeadObject(req); + auto outcome = client_lock.Move()->HeadObject(req); if (!outcome.IsSuccess()) { if (IsNotFound(outcome.GetError())) { return PathNotFound(path_); @@ -1343,7 +1389,7 @@ class ObjectOutputStream final : public io::OutputStream { req.SetContentType("application/octet-stream"); } - auto outcome = client_lock->CreateMultipartUpload(req); + auto outcome = client_lock.Move()->CreateMultipartUpload(req); if (!outcome.IsSuccess()) { return ErrorToStatus( std::forward_as_tuple("When initiating multiple part upload for key '", @@ -1368,7 +1414,7 @@ class ObjectOutputStream final : public io::OutputStream { req.SetKey(ToAwsString(path_.key)); req.SetUploadId(upload_id_); - auto outcome = client_lock->AbortMultipartUpload(req); + auto outcome = client_lock.Move()->AbortMultipartUpload(req); if (!outcome.IsSuccess()) { return ErrorToStatus( std::forward_as_tuple("When aborting multiple part upload for key '", path_.key, @@ -1418,7 +1464,8 @@ class ObjectOutputStream final : public io::OutputStream { req.SetUploadId(upload_id_); req.SetMultipartUpload(std::move(completed_upload)); - auto outcome = client_lock->CompleteMultipartUploadWithErrorFixup(std::move(req)); + auto outcome = + client_lock.Move()->CompleteMultipartUploadWithErrorFixup(std::move(req)); if (!outcome.IsSuccess()) { return ErrorToStatus( std::forward_as_tuple("When completing multiple part upload for key '", @@ -1527,8 +1574,6 @@ class ObjectOutputStream final : public io::OutputStream { Status UploadPart(const void* data, int64_t nbytes, std::shared_ptr owned_buffer = nullptr) { - ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); - S3Model::UploadPartRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); @@ -1538,7 +1583,8 @@ class ObjectOutputStream final : public io::OutputStream { if (!background_writes_) { req.SetBody(std::make_shared(data, nbytes)); - auto outcome = client_lock->UploadPart(req); + ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); + auto outcome = 
client_lock.Move()->UploadPart(req); if (!outcome.IsSuccess()) { return UploadPartError(req, outcome); } else { @@ -1562,21 +1608,17 @@ class ObjectOutputStream final : public io::OutputStream { upload_state_->pending_parts_completed = Future<>::Make(); } } - // XXX This callback returns Aws::Utils::Outcome, it cannot easily call - // `holder->Lock()` which returns arrow::Result. - ARROW_ASSIGN_OR_RAISE( - auto fut, - SubmitIO(io_context_, [client_lock = std::move(client_lock), req]() mutable { - return client_lock->UploadPart(req); - })); + // The closure keeps the buffer and the upload state alive - auto state = upload_state_; - auto part_number = part_number_; - auto handler = [owned_buffer, state, part_number, - req](const Result& result) -> void { - HandleUploadOutcome(state, part_number, req, result); + auto deferred = [owned_buffer, holder = holder_, req = std::move(req), + state = upload_state_, + part_number = part_number_]() mutable -> Status { + ARROW_ASSIGN_OR_RAISE(auto client_lock, holder->Lock()); + auto outcome = client_lock.Move()->UploadPart(req); + HandleUploadOutcome(state, part_number, req, outcome); + return Status::OK(); }; - fut.AddCallback(std::move(handler)); + RETURN_NOT_OK(SubmitIO(io_context_, std::move(deferred))); } ++part_number_; @@ -1667,146 +1709,6 @@ void FileObjectToInfo(const S3Model::Object& obj, FileInfo* info) { info->set_mtime(FromAwsDatetime(obj.GetLastModified())); } -struct TreeWalker : public std::enable_shared_from_this { - using ResultHandler = std::function; - using ErrorHandler = std::function& error)>; - using RecursionHandler = std::function(int32_t nesting_depth)>; - - std::shared_ptr holder_; - io::IOContext io_context_; - const std::string bucket_; - const std::string base_dir_; - const int32_t max_keys_; - const ResultHandler result_handler_; - const ErrorHandler error_handler_; - const RecursionHandler recursion_handler_; - - template - static Status Walk(Args&&... args) { - return WalkAsync(std::forward(args)...).status(); - } - - template - static Future<> WalkAsync(Args&&... 
args) { - auto self = std::make_shared(std::forward(args)...); - return self->DoWalk(); - } - - TreeWalker(std::shared_ptr holder, io::IOContext io_context, - std::string bucket, std::string base_dir, int32_t max_keys, - ResultHandler result_handler, ErrorHandler error_handler, - RecursionHandler recursion_handler) - : holder_(std::move(holder)), - io_context_(io_context), - bucket_(std::move(bucket)), - base_dir_(std::move(base_dir)), - max_keys_(max_keys), - result_handler_(std::move(result_handler)), - error_handler_(std::move(error_handler)), - recursion_handler_(std::move(recursion_handler)) {} - - private: - std::shared_ptr task_group_; - std::mutex mutex_; - - Future<> DoWalk() { - task_group_ = - TaskGroup::MakeThreaded(io_context_.executor(), io_context_.stop_token()); - WalkChild(base_dir_, /*nesting_depth=*/0); - // When this returns, ListObjectsV2 tasks either have finished or will exit early - return task_group_->FinishAsync(); - } - - bool ok() const { return task_group_->ok(); } - - struct ListObjectsV2Handler { - std::shared_ptr walker; - std::string prefix; - int32_t nesting_depth; - S3Model::ListObjectsV2Request req; - - Status operator()(const Result& result) { - // Serialize calls to operation-specific handlers - if (!walker->ok()) { - // Early exit: avoid executing handlers if DoWalk() returned - return Status::OK(); - } - if (!result.ok()) { - return result.status(); - } - const auto& outcome = *result; - if (!outcome.IsSuccess()) { - { - std::lock_guard guard(walker->mutex_); - return walker->error_handler_(outcome.GetError()); - } - } - return HandleResult(outcome.GetResult()); - } - - void SpawnListObjectsV2() { - auto cb = *this; - walker->task_group_->Append([cb]() mutable { - ARROW_ASSIGN_OR_RAISE(auto client_lock, cb.walker->holder_->Lock()); - Result result = client_lock->ListObjectsV2(cb.req); - return cb(result); - }); - } - - Status HandleResult(const S3Model::ListObjectsV2Result& result) { - bool recurse; - { - // Only one thread should be running result_handler_/recursion_handler_ at a time - std::lock_guard guard(walker->mutex_); - recurse = result.GetCommonPrefixes().size() > 0; - if (recurse) { - ARROW_ASSIGN_OR_RAISE(auto maybe_recurse, - walker->recursion_handler_(nesting_depth + 1)); - recurse &= maybe_recurse; - } - RETURN_NOT_OK(walker->result_handler_(prefix, result)); - } - if (recurse) { - walker->WalkChildren(result, nesting_depth + 1); - } - // If the result was truncated, issue a continuation request to get - // further directory entries. 
- if (result.GetIsTruncated()) { - DCHECK(!result.GetNextContinuationToken().empty()); - req.SetContinuationToken(result.GetNextContinuationToken()); - SpawnListObjectsV2(); - } - return Status::OK(); - } - - void Start() { - req.SetBucket(ToAwsString(walker->bucket_)); - if (!prefix.empty()) { - req.SetPrefix(ToAwsString(prefix) + kSep); - } - req.SetDelimiter(Aws::String() + kSep); - req.SetMaxKeys(walker->max_keys_); - SpawnListObjectsV2(); - } - }; - - void WalkChild(std::string key, int32_t nesting_depth) { - ListObjectsV2Handler handler{shared_from_this(), std::move(key), nesting_depth, {}}; - handler.Start(); - } - - void WalkChildren(const S3Model::ListObjectsV2Result& result, int32_t nesting_depth) { - for (const auto& prefix : result.GetCommonPrefixes()) { - const auto child_key = - internal::RemoveTrailingSlash(FromAwsString(prefix.GetPrefix())); - WalkChild(std::string{child_key}, nesting_depth); - } - } - - friend struct ListObjectsV2Handler; -}; - } // namespace // ----------------------------------------------------------------------- @@ -1819,11 +1721,9 @@ class S3FileSystem::Impl : public std::enable_shared_from_this holder_; std::optional backend_; - const int32_t kListObjectsMaxKeys = 1000; + static constexpr int32_t kListObjectsMaxKeys = 1000; // At most 1000 keys per multiple-delete request - const int32_t kMultipleDeleteMaxKeys = 1000; - // Limit recursing depth, since a recursion bomb can be created - const int32_t kMaxNestingDepth = 100; + static constexpr int32_t kMultipleDeleteMaxKeys = 1000; explicit Impl(S3Options options, io::IOContext io_context) : builder_(std::move(options)), io_context_(io_context) {} @@ -1850,7 +1750,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisHeadBucket(req); + auto outcome = client_lock.Move()->HeadBucket(req); if (!outcome.IsSuccess()) { if (!IsNotFound(outcome.GetError())) { return ErrorToStatus(std::forward_as_tuple( @@ -1864,13 +1764,12 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisLock()); - // Check bucket exists first. 
{ S3Model::HeadBucketRequest req; req.SetBucket(ToAwsString(bucket)); - auto outcome = client_lock->HeadBucket(req); + ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); + auto outcome = client_lock.Move()->HeadBucket(req); if (outcome.IsSuccess()) { return Status::OK(); @@ -1900,7 +1799,8 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisCreateBucket(req); + ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); + auto outcome = client_lock.Move()->CreateBucket(req); if (!outcome.IsSuccess() && !IsAlreadyExists(outcome.GetError())) { return ErrorToStatus(std::forward_as_tuple("When creating bucket '", bucket, "': "), "CreateBucket", outcome.GetError()); @@ -1918,7 +1818,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_this("")); return OutcomeToStatus( std::forward_as_tuple("When creating key '", key, "' in bucket '", bucket, "': "), - "PutObject", client_lock->PutObject(req)); + "PutObject", client_lock.Move()->PutObject(req)); } Status CreateEmptyDir(const std::string& bucket, const std::string& key) { @@ -1934,7 +1834,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisDeleteObject(req)); + "DeleteObject", client_lock.Move()->DeleteObject(req)); } Status CopyObject(const S3Path& src_path, const S3Path& dest_path) { @@ -1950,7 +1850,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisCopyObject(req)); + "CopyObject", client_lock.Move()->CopyObject(req)); } // On Minio, an empty "directory" doesn't satisfy the same API requests as @@ -1989,7 +1889,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisHeadObject(req); + auto outcome = client_lock.Move()->HeadObject(req); if (outcome.IsSuccess()) { return true; } @@ -2022,7 +1922,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisListObjectsV2(req); + auto outcome = client_lock.Move()->ListObjectsV2(req); if (outcome.IsSuccess()) { const S3Model::ListObjectsV2Result& r = outcome.GetResult(); // In some cases, there may be 0 keys but some prefixes @@ -2037,204 +1937,308 @@ class S3FileSystem::Impl : public std::enable_shared_from_this= kMaxNestingDepth) { - return Status::IOError("S3 filesystem tree exceeds maximum nesting depth (", - kMaxNestingDepth, ")"); + static FileInfo MakeDirectoryInfo(std::string dirname) { + FileInfo dir; + dir.set_type(FileType::Directory); + dir.set_path(std::move(dirname)); + return dir; + } + + static std::vector MakeDirectoryInfos(std::vector dirnames) { + std::vector dir_infos; + for (auto& dirname : dirnames) { + dir_infos.push_back(MakeDirectoryInfo(std::move(dirname))); } - return Status::OK(); + return dir_infos; } - // A helper class for Walk and WalkAsync - struct FileInfoCollector { - FileInfoCollector(std::string bucket, std::string key, const FileSelector& select) - : bucket(std::move(bucket)), - key(std::move(key)), - allow_not_found(select.allow_not_found) {} + using FileInfoSink = PushGenerator>::Producer; - Status Collect(const std::string& prefix, const S3Model::ListObjectsV2Result& result, - std::vector* out) { - // Walk "directories" - for (const auto& child_prefix : result.GetCommonPrefixes()) { - is_empty = false; - const auto child_key = - internal::RemoveTrailingSlash(FromAwsString(child_prefix.GetPrefix())); - std::stringstream child_path; - child_path << bucket << kSep << child_key; - FileInfo info; - info.set_path(child_path.str()); - info.set_type(FileType::Directory); - out->push_back(std::move(info)); + struct FileListerState { + FileInfoSink files_queue; + const bool allow_not_found; + 
const int max_recursion; + const bool include_implicit_dirs; + const io::IOContext io_context; + S3ClientHolder* const holder; + + S3Model::ListObjectsV2Request req; + std::unordered_set directories; + bool empty = true; + + FileListerState(PushGenerator>::Producer files_queue, + FileSelector select, const std::string& bucket, + const std::string& key, bool include_implicit_dirs, + io::IOContext io_context, S3ClientHolder* holder) + : files_queue(std::move(files_queue)), + allow_not_found(select.allow_not_found), + max_recursion(select.max_recursion), + include_implicit_dirs(include_implicit_dirs), + io_context(std::move(io_context)), + holder(holder) { + req.SetBucket(bucket); + req.SetMaxKeys(kListObjectsMaxKeys); + if (!key.empty()) { + req.SetPrefix(key + kSep); } - // Walk "files" - for (const auto& obj : result.GetContents()) { - is_empty = false; - FileInfo info; - const auto child_key = internal::RemoveTrailingSlash(FromAwsString(obj.GetKey())); - if (child_key == std::string_view(prefix)) { - // Amazon can return the "directory" key itself as part of the results, skip - continue; - } - std::stringstream child_path; - child_path << bucket << kSep << child_key; - info.set_path(child_path.str()); - FileObjectToInfo(obj, &info); - out->push_back(std::move(info)); + if (!select.recursive) { + req.SetDelimiter(Aws::String() + kSep); } - return Status::OK(); } - Status Finish(Impl* impl) { - // If no contents were found, perhaps it's an empty "directory", - // or perhaps it's a nonexistent entry. Check. - if (is_empty && !allow_not_found) { - ARROW_ASSIGN_OR_RAISE(bool is_actually_empty, - impl->IsEmptyDirectory(bucket, key)); - if (!is_actually_empty) { - return PathNotFound(bucket, key); - } + void Finish() { + // `empty` means that we didn't get a single file info back from S3. This may be + // a situation that we should consider as PathNotFound. + // + // * If the prefix is empty then we were querying the contents of an entire bucket + // and this is not a PathNotFound case because if the bucket didn't exist then + // we would have received an error and not an empty set of results. + // + // * If the prefix is not empty then we asked for all files under a particular + // directory. S3 will also return the directory itself, if it exists. So if + // we get zero results then we know that there are no files under the directory + // and the directory itself doesn't exist. This should be considered PathNotFound + if (empty && !allow_not_found && !req.GetPrefix().empty()) { + files_queue.Push(PathNotFound(req.GetBucket(), req.GetPrefix())); } - return Status::OK(); } - std::string bucket; - std::string key; - bool allow_not_found; - bool is_empty = true; + // Given a path, iterate through all possible sub-paths and, if we haven't + // seen that sub-path before, return it. + // + // For example, given A/B/C we might return A/B and A if we have not seen + // those paths before. This allows us to consider "implicit" directories which + // don't exist as objects in S3 but can be inferred. 
+ std::vector GetNewDirectories(const std::string_view& path) { + std::string current(path); + std::string base = req.GetBucket(); + if (!req.GetPrefix().empty()) { + base = base + kSep + std::string(internal::RemoveTrailingSlash(req.GetPrefix())); + } + std::vector new_directories; + while (true) { + const std::string parent_dir = internal::GetAbstractPathParent(current).first; + if (parent_dir.empty()) { + break; + } + current = parent_dir; + if (current == base) { + break; + } + if (directories.insert(parent_dir).second) { + new_directories.push_back(std::move(parent_dir)); + } + } + return new_directories; + } }; - // Workhorse for GetFileInfo(FileSelector...) - Status Walk(const FileSelector& select, const std::string& bucket, - const std::string& key, std::vector* out) { - RETURN_NOT_OK(CheckS3Initialized()); + struct FileListerTask : public util::AsyncTaskScheduler::Task { + std::shared_ptr state; + util::AsyncTaskScheduler* scheduler; - FileInfoCollector collector(bucket, key, select); + FileListerTask(std::shared_ptr state, + util::AsyncTaskScheduler* scheduler) + : state(std::move(state)), scheduler(scheduler) {} - auto handle_error = [&](const AWSError& error) -> Status { - if (select.allow_not_found && IsNotFound(error)) { - return Status::OK(); + std::vector ToFileInfos(const std::string& bucket, + const std::string& prefix, + const S3Model::ListObjectsV2Result& result) { + std::vector file_infos; + // If this is a non-recursive listing we may see "common prefixes" which represent + // directories we did not recurse into. We will add those as directories. + for (const auto& child_prefix : result.GetCommonPrefixes()) { + const auto child_key = + internal::RemoveTrailingSlash(FromAwsString(child_prefix.GetPrefix())); + std::stringstream child_path_ss; + child_path_ss << bucket << kSep << child_key; + FileInfo info; + info.set_path(child_path_ss.str()); + info.set_type(FileType::Directory); + file_infos.push_back(std::move(info)); } - return ErrorToStatus(std::forward_as_tuple("When listing objects under key '", key, - "' in bucket '", bucket, "': "), - "ListObjectsV2", error); - }; - - auto handle_recursion = [&](int32_t nesting_depth) -> Result { - RETURN_NOT_OK(CheckNestingDepth(nesting_depth)); - return select.recursive && nesting_depth <= select.max_recursion; - }; - - auto handle_results = [&](const std::string& prefix, - const S3Model::ListObjectsV2Result& result) -> Status { - return collector.Collect(prefix, result, out); - }; - - RETURN_NOT_OK(TreeWalker::Walk(holder_, io_context_, bucket, key, kListObjectsMaxKeys, - handle_results, handle_error, handle_recursion)); - - // If no contents were found, perhaps it's an empty "directory", - // or perhaps it's a nonexistent entry. Check. - RETURN_NOT_OK(collector.Finish(this)); - // Sort results for convenience, since they can come massively out of order - std::sort(out->begin(), out->end(), FileInfo::ByPath{}); - return Status::OK(); - } - - // Workhorse for GetFileInfoGenerator(FileSelector...) - FileInfoGenerator WalkAsync(const FileSelector& select, const std::string& bucket, - const std::string& key) { - PushGenerator> gen; - auto producer = gen.producer(); - auto collector = std::make_shared(bucket, key, select); - auto self = shared_from_this(); + // S3 doesn't have any concept of "max depth" and so we emulate it by counting the + // number of '/' characters. E.g. if the user is searching bucket/subdirA/subdirB + // then the starting depth is 2. 
+ // A file subdirA/subdirB/somefile will have a child depth of 2 and a "depth" of 0. + // A file subdirA/subdirB/subdirC/somefile will have a child depth of 3 and a + // "depth" of 1 + int base_depth = internal::GetAbstractPathDepth(prefix); + for (const auto& obj : result.GetContents()) { + if (obj.GetKey() == prefix) { + // S3 will return the basedir itself (if it is a file / empty file). We don't + // want that. But this is still considered "finding the basedir" and so we mark + // it "not empty". + state->empty = false; + continue; + } + std::string child_key = + std::string(internal::RemoveTrailingSlash(FromAwsString(obj.GetKey()))); + bool had_trailing_slash = child_key.size() != obj.GetKey().size(); + int child_depth = internal::GetAbstractPathDepth(child_key); + // Recursion depth is 1 smaller because a path with depth 1 (e.g. foo) is + // considered to have a "recursion" of 0 + int recursion_depth = child_depth - base_depth - 1; + if (recursion_depth > state->max_recursion) { + // If we have A/B/C/D and max_recursion is 2 then we ignore this (don't add it + // to file_infos) but we still want to potentially add A and A/B as directories. + // So we "pretend" like we have a file A/B/C for the call to GetNewDirectories + // below + int to_trim = recursion_depth - state->max_recursion - 1; + if (to_trim > 0) { + child_key = bucket + kSep + + internal::SliceAbstractPath(child_key, 0, child_depth - to_trim); + } else { + child_key = bucket + kSep + child_key; + } + } else { + // If the file isn't beyond our max recursion then count it as a file + // unless it's empty and then it depends on whether or not the file ends + // with a trailing slash + std::stringstream child_path_ss; + child_path_ss << bucket << kSep << child_key; + child_key = child_path_ss.str(); + if (obj.GetSize() > 0 || !had_trailing_slash) { + // We found a real file + FileInfo info; + info.set_path(child_key); + FileObjectToInfo(obj, &info); + file_infos.push_back(std::move(info)); + } else { + // We found an empty file and we want to treat it like a directory. Only + // add it if we haven't seen this directory before. + if (state->directories.insert(child_key).second) { + file_infos.push_back(MakeDirectoryInfo(child_key)); + } + } + } - auto handle_error = [select, bucket, key](const AWSError& error) -> Status { - if (select.allow_not_found && IsNotFound(error)) { - return Status::OK(); + if (state->include_implicit_dirs) { + // Now that we've dealt with the file itself we need to look at each of the + // parent paths and potentially add them as directories. For example, after + // finding a file A/B/C/D we want to consider adding directories A, A/B, and + // A/B/C. 
+ for (const auto& newdir : state->GetNewDirectories(child_key)) { + file_infos.push_back(MakeDirectoryInfo(newdir)); + } + } } - return ErrorToStatus(std::forward_as_tuple("When listing objects under key '", key, - "' in bucket '", bucket, "': "), - "ListObjectsV2", error); - }; - - auto handle_recursion = [producer, select, - self](int32_t nesting_depth) -> Result { - if (producer.is_closed()) { - return false; + if (file_infos.size() > 0) { + state->empty = false; } - RETURN_NOT_OK(self->CheckNestingDepth(nesting_depth)); - return select.recursive && nesting_depth <= select.max_recursion; - }; + return file_infos; + } - auto handle_results = - [collector, producer]( - const std::string& prefix, - const S3Model::ListObjectsV2Result& result) mutable -> Status { - std::vector out; - RETURN_NOT_OK(collector->Collect(prefix, result, &out)); - if (!out.empty()) { - producer.Push(std::move(out)); + void Run() { + // We are on an I/O thread now so just synchronously make the call and interpret the + // results. + Result client_lock = state->holder->Lock(); + if (!client_lock.ok()) { + state->files_queue.Push(client_lock.status()); + return; } - return Status::OK(); - }; - - TreeWalker::WalkAsync(holder_, io_context_, bucket, key, kListObjectsMaxKeys, - handle_results, handle_error, handle_recursion) - .AddCallback([collector, producer, self](const Status& status) mutable { - auto st = collector->Finish(self.get()); - if (!st.ok()) { - producer.Push(st); - } - producer.Close(); - }); - return gen; - } - - struct WalkResult { - std::vector file_keys; - std::vector dir_keys; - }; - Future> WalkForDeleteDirAsync(const std::string& bucket, - const std::string& key) { - auto state = std::make_shared(); - - auto handle_results = [state](const std::string& prefix, - const S3Model::ListObjectsV2Result& result) -> Status { - // Walk "files" - state->file_keys.reserve(state->file_keys.size() + result.GetContents().size()); - for (const auto& obj : result.GetContents()) { - state->file_keys.emplace_back(FromAwsString(obj.GetKey())); + S3Model::ListObjectsV2Outcome outcome = + client_lock->Move()->ListObjectsV2(state->req); + if (!outcome.IsSuccess()) { + const auto& err = outcome.GetError(); + if (state->allow_not_found && IsNotFound(err)) { + return; + } + state->files_queue.Push( + ErrorToStatus(std::forward_as_tuple("When listing objects under key '", + state->req.GetPrefix(), "' in bucket '", + state->req.GetBucket(), "': "), + "ListObjectsV2", err)); + return; } - // Walk "directories" - state->dir_keys.reserve(state->dir_keys.size() + result.GetCommonPrefixes().size()); - for (const auto& prefix : result.GetCommonPrefixes()) { - state->dir_keys.emplace_back(FromAwsString(prefix.GetPrefix())); + const S3Model::ListObjectsV2Result& result = outcome.GetResult(); + // We could immediately schedule the continuation (if there are enough results to + // trigger paging) but that would introduce race condition complexity for arguably + // little benefit. + std::vector file_infos = + ToFileInfos(state->req.GetBucket(), state->req.GetPrefix(), result); + if (file_infos.size() > 0) { + state->files_queue.Push(std::move(file_infos)); } - return Status::OK(); - }; - auto handle_error = [=](const AWSError& error) -> Status { - return ErrorToStatus(std::forward_as_tuple("When listing objects under key '", key, - "' in bucket '", bucket, "': "), - "ListObjectsV2", error); - }; + // If there are enough files to warrant a continuation then go ahead and schedule + // that now. 
+ if (result.GetIsTruncated()) { + DCHECK(!result.GetNextContinuationToken().empty()); + state->req.SetContinuationToken(result.GetNextContinuationToken()); + scheduler->AddTask(std::make_unique(state, scheduler)); + } else { + // Otherwise, we have finished listing all the files + state->Finish(); + } + } - Result> operator()() override { + return state->io_context.executor()->Submit([this] { + Run(); + return Status::OK(); + }); + } + std::string_view name() const override { return "S3ListFiles"; } + }; - // Lists all files, potentially recursively, in a bucket + // + // include_implicit_dirs controls whether or not implicit directories should be + // included. These are directories that are not actually file objects but instead are + // inferred from other objects. + // + // For example, if a file exists with path A/B/C then implicit directories A/ and A/B/ + // will exist even if there are no file objects with these paths. + void ListAsync(const FileSelector& select, const std::string& bucket, + const std::string& key, bool include_implicit_dirs, + util::AsyncTaskScheduler* scheduler, FileInfoSink sink) { + // We can only fetch kListObjectsMaxKeys files at a time and so we create a + // scheduler and schedule a task to grab the first batch. Once that's done we + // schedule a new task for the next batch. All of these tasks share the same + // FileListerState object but none of these tasks run in parallel so there is + // no need to worry about mutexes + auto state = std::make_shared(sink, select, bucket, key, + include_implicit_dirs, io_context_, + this->holder_.get()); + + // Create the first file lister task (it may spawn more) + auto file_lister_task = std::make_unique(state, scheduler); + scheduler->AddTask(std::move(file_lister_task)); + } + + // Fully list all files from all buckets + void FullListAsync(bool include_implicit_dirs, util::AsyncTaskScheduler* scheduler, + FileInfoSink sink, bool recursive) { + scheduler->AddSimpleTask( + [this, scheduler, sink, include_implicit_dirs, recursive]() mutable { + return ListBucketsAsync().Then( + [this, scheduler, sink, include_implicit_dirs, + recursive](const std::vector& buckets) mutable { + // Return the buckets themselves as directories + std::vector buckets_as_directories = + MakeDirectoryInfos(buckets); + sink.Push(std::move(buckets_as_directories)); + + if (recursive) { + // Recursively list each bucket (these will run in parallel but sink + // should be thread safe and so this is ok) + for (const auto& bucket : buckets) { + FileSelector select; + select.allow_not_found = true; + select.recursive = true; + select.base_dir = bucket; + ListAsync(select, bucket, "", include_implicit_dirs, scheduler, sink); + } + } + }); + }, + std::string_view("FullListBucketScan")); } // Delete multiple objects at once Future<> DeleteObjectsAsync(const std::string& bucket, const std::vector& keys) { - ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); - struct DeleteCallback { std::string bucket; @@ -2263,12 +2267,14 @@ class S3FileSystem::Impl : public std::enable_shared_from_this> futures; -
futures.reserve(bit_util::CeilDiv(keys.size(), chunk_size)); for (size_t start = 0; start < keys.size(); start += chunk_size) { S3Model::DeleteObjectsRequest req; S3Model::Delete del; - for (size_t i = start; i < std::min(keys.size(), chunk_size); ++i) { + size_t remaining = keys.size() - start; + size_t next_chunk_size = std::min(remaining, chunk_size); + for (size_t i = start; i < start + next_chunk_size; ++i) { del.AddObjects(S3Model::ObjectIdentifier().WithKey(ToAwsString(keys[i]))); } req.SetBucket(ToAwsString(bucket)); @@ -2278,40 +2284,154 @@ class S3FileSystem::Impl : public std::enable_shared_from_this Status { ARROW_ASSIGN_OR_RAISE(auto client_lock, holder->Lock()); - return delete_cb(client_lock->DeleteObjects(req)); + return delete_cb(client_lock.Move()->DeleteObjects(req)); })); futures.push_back(std::move(fut)); } - return AllComplete(futures); + return AllFinished(futures); } Status DeleteObjects(const std::string& bucket, const std::vector& keys) { return DeleteObjectsAsync(bucket, keys).status(); } + // Check to make sure the given path is not a file + // + // Returns true if the path seems to be a directory, false if it is a file + Future EnsureIsDirAsync(const std::string& bucket, const std::string& key) { + if (key.empty()) { + // There is no way for a bucket to be a file + return Future::MakeFinished(true); + } + auto self = shared_from_this(); + return DeferNotOk( + SubmitIO(io_context_, [self, bucket, key]() mutable -> Result { + S3Model::HeadObjectRequest req; + req.SetBucket(ToAwsString(bucket)); + req.SetKey(ToAwsString(key)); + + ARROW_ASSIGN_OR_RAISE(auto client_lock, self->holder_->Lock()); + auto outcome = client_lock.Move()->HeadObject(req); + if (outcome.IsSuccess()) { + const auto& result = outcome.GetResult(); + // A directory should be empty and have a trailing slash. Anything else + // we can consider a file + return result.GetContentLength() <= 0 && key[key.size() - 1] == '/'; + } + if (IsNotFound(outcome.GetError())) { + // If we can't find it then it isn't a file. + return true; + } else { + return ErrorToStatus( + std::forward_as_tuple("When getting information for key '", key, + "' in bucket '", bucket, "': "), + "HeadObject", outcome.GetError()); + } + })); + } + + // Some operations require running multiple S3 calls, either in parallel or serially. We + // need to ensure that the S3 filesystem instance stays valid and that S3 isn't + // finalized. We do this by wrapping all the tasks in a scheduler which keeps the + // resources alive + Future<> RunInScheduler( + std::function callable) { + auto self = shared_from_this(); + FnOnce initial_task = + [callable = std::move(callable), + this](util::AsyncTaskScheduler* scheduler) mutable { + return callable(scheduler, this); + }; + Future<> scheduler_fut = util::AsyncTaskScheduler::Make( + std::move(initial_task), + /*abort_callback=*/ + [](const Status& st) { + // No need for special abort logic. 
+ }, + io_context_.stop_token()); + // Keep self alive until all tasks finish + return scheduler_fut.Then([self]() { return Status::OK(); }); + } + + Future<> DoDeleteDirContentsAsync(const std::string& bucket, const std::string& key) { + return RunInScheduler( + [bucket, key](util::AsyncTaskScheduler* scheduler, S3FileSystem::Impl* self) { + scheduler->AddSimpleTask( + [=] { + FileSelector select; + select.base_dir = bucket + kSep + key; + select.recursive = true; + select.allow_not_found = false; + + FileInfoGenerator file_infos = self->GetFileInfoGenerator(select); + + auto handle_file_infos = [=](const std::vector& file_infos) { + std::vector file_paths; + for (const auto& file_info : file_infos) { + DCHECK_GT(file_info.path().size(), bucket.size()); + file_paths.push_back(file_info.path().substr(bucket.size() + 1)); + } + scheduler->AddSimpleTask( + [=, file_paths = std::move(file_paths)] { + return self->DeleteObjectsAsync(bucket, file_paths); + }, + std::string_view("DeleteDirContentsDeleteTask")); + return Status::OK(); + }; + + return VisitAsyncGenerator( + AsyncGenerator>(std::move(file_infos)), + std::move(handle_file_infos)); + }, + std::string_view("ListFilesForDelete")); + return Status::OK(); + }); + } + Future<> DeleteDirContentsAsync(const std::string& bucket, const std::string& key) { auto self = shared_from_this(); - return WalkForDeleteDirAsync(bucket, key) - .Then([bucket, key, - self](const std::shared_ptr& discovered) -> Future<> { - if (discovered->file_keys.empty() && discovered->dir_keys.empty() && - !key.empty()) { - // No contents found, is it an empty directory? - ARROW_ASSIGN_OR_RAISE(bool exists, self->IsEmptyDirectory(bucket, key)); - if (!exists) { - return PathNotFound(bucket, key); - } + return EnsureIsDirAsync(bucket, key) + .Then([self, bucket, key](bool is_dir) -> Future<> { + if (!is_dir) { + return Status::IOError("Cannot delete directory contents at ", bucket, kSep, + key, " because it is a file"); } - // First delete all "files", then delete all child "directories" - return self->DeleteObjectsAsync(bucket, discovered->file_keys) - .Then([bucket, discovered, self]() { - // Delete directories in reverse lexicographic order, to ensure children - // are deleted before their parents (Minio). 
- std::sort(discovered->dir_keys.rbegin(), discovered->dir_keys.rend()); - return self->DeleteObjectsAsync(bucket, discovered->dir_keys); - }); + return self->DoDeleteDirContentsAsync(bucket, key); + }); + } + + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) { + auto maybe_base_path = S3Path::FromString(select.base_dir); + if (!maybe_base_path.ok()) { + return MakeFailingGenerator(maybe_base_path.status()); + } + auto base_path = *std::move(maybe_base_path); + + PushGenerator> generator; + Future<> scheduler_fut = RunInScheduler( + [select, base_path, sink = generator.producer()]( + util::AsyncTaskScheduler* scheduler, S3FileSystem::Impl* self) { + if (base_path.empty()) { + bool should_recurse = select.recursive && select.max_recursion > 0; + self->FullListAsync(/*include_implicit_dirs=*/true, scheduler, sink, + should_recurse); + } else { + self->ListAsync(select, base_path.bucket, base_path.key, + /*include_implicit_dirs=*/true, scheduler, sink); + } + return Status::OK(); }); + + // Mark the generator done once all tasks are finished + scheduler_fut.AddCallback([sink = generator.producer()](const Status& st) mutable { + if (!st.ok()) { + sink.Push(st); + } + sink.Close(); + }); + + return generator; } Status EnsureDirectoryExists(const S3Path& path) { @@ -2344,22 +2464,16 @@ class S3FileSystem::Impl : public std::enable_shared_from_this> ListBuckets() { ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); - - auto outcome = client_lock->ListBuckets(); - return ProcessListBuckets(outcome); + return ProcessListBuckets(client_lock.Move()->ListBuckets()); } - Future> ListBucketsAsync(io::IOContext ctx) { - ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); - - return DeferNotOk(SubmitIO(ctx, - [client_lock = std::move(client_lock)]() mutable { - return client_lock->ListBuckets(); - })) - // TODO(ARROW-12655) Change to Then(Impl::ProcessListBuckets) - .Then([](const Aws::S3::Model::ListBucketsOutcome& outcome) { - return Impl::ProcessListBuckets(outcome); - }); + Future> ListBucketsAsync() { + auto deferred = + [self = shared_from_this()]() mutable -> Result> { + ARROW_ASSIGN_OR_RAISE(auto client_lock, self->holder_->Lock()); + return self->ProcessListBuckets(client_lock.Move()->ListBuckets()); + }; + return DeferNotOk(SubmitIO(io_context_, std::move(deferred))); } Result> OpenInputFile(const std::string& s, @@ -2449,7 +2563,7 @@ Result S3FileSystem::GetFileInfo(const std::string& s) { S3Model::HeadBucketRequest req; req.SetBucket(ToAwsString(path.bucket)); - auto outcome = client_lock->HeadBucket(req); + auto outcome = client_lock.Move()->HeadBucket(req); if (!outcome.IsSuccess()) { if (!IsNotFound(outcome.GetError())) { const auto msg = "When getting information for bucket '" + path.bucket + "': "; @@ -2469,7 +2583,7 @@ Result S3FileSystem::GetFileInfo(const std::string& s) { req.SetBucket(ToAwsString(path.bucket)); req.SetKey(ToAwsString(path.key)); - auto outcome = client_lock->HeadObject(req); + auto outcome = client_lock.Move()->HeadObject(req); if (outcome.IsSuccess()) { // "File" object found FileObjectToInfo(outcome.GetResult(), &info); @@ -2499,73 +2613,19 @@ Result S3FileSystem::GetFileInfo(const std::string& s) { } Result S3FileSystem::GetFileInfo(const FileSelector& select) { - ARROW_ASSIGN_OR_RAISE(auto base_path, S3Path::FromString(select.base_dir)); - - FileInfoVector results; - - if (base_path.empty()) { - // List all buckets - ARROW_ASSIGN_OR_RAISE(auto buckets, impl_->ListBuckets()); - for (const auto& bucket : buckets) { - FileInfo info; - 
info.set_path(bucket); - info.set_type(FileType::Directory); - results.push_back(std::move(info)); - if (select.recursive) { - RETURN_NOT_OK(impl_->Walk(select, bucket, "", &results)); - } - } - return results; + Future> file_infos_fut = + CollectAsyncGenerator(GetFileInfoGenerator(select)); + ARROW_ASSIGN_OR_RAISE(std::vector file_infos, file_infos_fut.result()); + FileInfoVector combined_file_infos; + for (const auto& file_info_vec : file_infos) { + combined_file_infos.insert(combined_file_infos.end(), file_info_vec.begin(), + file_info_vec.end()); } - - // Nominal case -> walk a single bucket - RETURN_NOT_OK(impl_->Walk(select, base_path.bucket, base_path.key, &results)); - return results; + return combined_file_infos; } FileInfoGenerator S3FileSystem::GetFileInfoGenerator(const FileSelector& select) { - auto maybe_base_path = S3Path::FromString(select.base_dir); - if (!maybe_base_path.ok()) { - return MakeFailingGenerator(maybe_base_path.status()); - } - auto base_path = *std::move(maybe_base_path); - - if (base_path.empty()) { - // List all buckets, then possibly recurse - PushGenerator> gen; - auto producer = gen.producer(); - - auto fut = impl_->ListBucketsAsync(io_context()); - auto impl = impl_->shared_from_this(); - fut.AddCallback( - [producer, select, impl](const Result>& res) mutable { - if (!res.ok()) { - producer.Push(res.status()); - producer.Close(); - return; - } - FileInfoVector buckets; - for (const auto& bucket : *res) { - buckets.push_back(FileInfo{bucket, FileType::Directory}); - } - // Generate all bucket infos - auto buckets_fut = Future::MakeFinished(std::move(buckets)); - producer.Push(MakeSingleFutureGenerator(buckets_fut)); - if (select.recursive) { - // Generate recursive walk for each bucket in turn - for (const auto& bucket : *buckets_fut.result()) { - producer.Push(impl->WalkAsync(select, bucket.path(), "")); - } - } - producer.Close(); - }); - - return MakeConcatenatedGenerator( - AsyncGenerator>{std::move(gen)}); - } - - // Nominal case -> walk a single bucket - return impl_->WalkAsync(select, base_path.bucket, base_path.key); + return impl_->GetFileInfoGenerator(select); } Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { @@ -2624,7 +2684,7 @@ Status S3FileSystem::DeleteDir(const std::string& s) { req.SetBucket(ToAwsString(path.bucket)); return OutcomeToStatus( std::forward_as_tuple("When deleting bucket '", path.bucket, "': "), - "DeleteBucket", client_lock->DeleteBucket(req)); + "DeleteBucket", client_lock.Move()->DeleteBucket(req)); } else if (path.key.empty()) { return Status::IOError("Would delete bucket '", path.bucket, "'. 
", "To delete buckets, enable the allow_bucket_deletion option."); @@ -2676,7 +2736,7 @@ Status S3FileSystem::DeleteFile(const std::string& s) { req.SetBucket(ToAwsString(path.bucket)); req.SetKey(ToAwsString(path.key)); - auto outcome = client_lock->HeadObject(req); + auto outcome = client_lock.Move()->HeadObject(req); if (!outcome.IsSuccess()) { if (IsNotFound(outcome.GetError())) { return PathNotFound(path); diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 5b0287d99716b..e9f14fde72316 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -59,7 +59,6 @@ #include "arrow/testing/future_util.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" -#include "arrow/testing/util.h" #include "arrow/util/async_generator.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" @@ -67,13 +66,17 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" +#include "arrow/util/string.h" namespace arrow { namespace fs { using ::arrow::internal::checked_pointer_cast; using ::arrow::internal::PlatformFilename; +using ::arrow::internal::ToChars; using ::arrow::internal::UriEscape; +using ::arrow::internal::Zip; using ::arrow::fs::internal::ConnectRetryStrategy; using ::arrow::fs::internal::ErrorToStatus; @@ -146,11 +149,6 @@ class ShortRetryStrategy : public S3RetryStrategy { class AwsTestMixin : public ::testing::Test { public: - // We set this environment variable to speed up tests by ensuring - // DefaultAWSCredentialsProviderChain does not query (inaccessible) - // EC2 metadata endpoint - AwsTestMixin() : ec2_metadata_disabled_guard_("AWS_EC2_METADATA_DISABLED", "true") {} - void SetUp() override { #ifdef AWS_CPP_SDK_S3_NOT_SHARED auto aws_log_level = Aws::Utils::Logging::LogLevel::Fatal; @@ -169,7 +167,6 @@ class AwsTestMixin : public ::testing::Test { } private: - EnvVarGuard ec2_metadata_disabled_guard_; #ifdef AWS_CPP_SDK_S3_NOT_SHARED Aws::SDKOptions aws_options_; #endif @@ -304,6 +301,13 @@ TEST_F(S3OptionsTest, FromUri) { // Invalid option ASSERT_RAISES(Invalid, S3Options::FromUri("s3://mybucket/?xxx=zzz", &path)); + + // Endpoint from environment variable + { + EnvVarGuard endpoint_guard("AWS_ENDPOINT_URL", "http://127.0.0.1:9000"); + ASSERT_OK_AND_ASSIGN(options, S3Options::FromUri("s3://mybucket/", &path)); + ASSERT_EQ(options.endpoint_override, "http://127.0.0.1:9000"); + } } TEST_F(S3OptionsTest, FromAccessKey) { @@ -459,16 +463,19 @@ class TestS3FS : public S3TestMixin { } } - void MakeFileSystem() { + Result> MakeNewFileSystem( + io::IOContext io_context = io::default_io_context()) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); options_.scheme = "http"; options_.endpoint_override = minio_->connect_string(); if (!options_.retry_strategy) { options_.retry_strategy = std::make_shared(); } - ASSERT_OK_AND_ASSIGN(fs_, S3FileSystem::Make(options_)); + return S3FileSystem::Make(options_, io_context); } + void MakeFileSystem() { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem()); } + template void AssertMetadataRoundtrip(const std::string& path, const std::shared_ptr& metadata, @@ -787,6 +794,81 @@ TEST_F(TestS3FS, GetFileInfoGenerator) { // Non-root dir case is tested by generic tests } +TEST_F(TestS3FS, GetFileInfoGeneratorStress) { + // This test is slow because it needs to create a bunch of seed files. 
However, it is + // the only test that stresses listing and deleting when there are more than 1000 files + // and paging is required. + constexpr int32_t kNumDirs = 4; + constexpr int32_t kNumFilesPerDir = 512; + FileInfoVector expected_infos; + + ASSERT_OK(fs_->CreateDir("stress")); + for (int32_t i = 0; i < kNumDirs; i++) { + const std::string dir_path = "stress/" + ToChars(i); + ASSERT_OK(fs_->CreateDir(dir_path)); + expected_infos.emplace_back(dir_path, FileType::Directory); + + std::vector> tasks; + for (int32_t j = 0; j < kNumFilesPerDir; j++) { + // Create the files in parallel in hopes of speeding up this process as much as + // possible + const std::string file_name = ToChars(j); + const std::string file_path = dir_path + "/" + file_name; + expected_infos.emplace_back(file_path, FileType::File); + ASSERT_OK_AND_ASSIGN(Future<> task, + ::arrow::internal::GetCpuThreadPool()->Submit( + [fs = fs_, file_name, file_path]() -> Status { + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr out_str, + fs->OpenOutputStream(file_path)); + ARROW_RETURN_NOT_OK(out_str->Write(file_name)); + return out_str->Close(); + })); + tasks.push_back(std::move(task)); + } + ASSERT_FINISHES_OK(AllFinished(tasks)); + } + SortInfos(&expected_infos); + + FileSelector select; + FileInfoVector infos; + select.base_dir = "stress"; + select.recursive = true; + + // 32 is pretty fast, listing is much faster than the create step above + constexpr int32_t kNumTasks = 32; + for (int i = 0; i < kNumTasks; i++) { + CollectFileInfoGenerator(fs_->GetFileInfoGenerator(select), &infos); + SortInfos(&infos); + // One info for each directory and one info for each file + ASSERT_EQ(infos.size(), expected_infos.size()); + for (const auto&& [info, expected] : Zip(infos, expected_infos)) { + AssertFileInfo(info, expected.path(), expected.type()); + } + } + + ASSERT_OK(fs_->DeleteDirContents("stress")); + + CollectFileInfoGenerator(fs_->GetFileInfoGenerator(select), &infos); + ASSERT_EQ(infos.size(), 0); +} + +TEST_F(TestS3FS, GetFileInfoGeneratorCancelled) { + FileSelector select; + FileInfoVector infos; + select.base_dir = "bucket"; + select.recursive = true; + + StopSource stop_source; + io::IOContext cancellable_context(stop_source.token()); + ASSERT_OK_AND_ASSIGN(std::shared_ptr cancellable_fs, + MakeNewFileSystem(cancellable_context)); + stop_source.RequestStop(); + FileInfoGenerator generator = cancellable_fs->GetFileInfoGenerator(select); + auto file_infos = CollectAsyncGenerator(std::move(generator)); + ASSERT_FINISHES_AND_RAISES(Cancelled, file_infos); +} + TEST_F(TestS3FS, CreateDir) { FileInfo st; diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index 917c0c33211e0..7383a7eec9045 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -102,84 +102,21 @@ set(CMAKE_CXX_FLAGS_BACKUP "${CMAKE_CXX_FLAGS}") string(REPLACE "/WX" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Werror " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -# Probe the version of gRPC being used to see if it supports disabling server -# verification when using TLS. -# gRPC's pkg-config file neglects to specify pthreads. -find_package(Threads REQUIRED) -function(test_grpc_version DST_VAR DETECT_VERSION TEST_FILE) - if(NOT DEFINED ${DST_VAR}) - message(STATUS "Checking support for TlsCredentialsOptions (gRPC >= ${DETECT_VERSION})..." 
- ) - get_property(CURRENT_INCLUDE_DIRECTORIES - DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - PROPERTY INCLUDE_DIRECTORIES) - # ARROW-13881: when detecting support, avoid mismatch between - # debug flags of gRPC and our probe (which results in LNK2038) - set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE}) - try_compile(HAS_GRPC_VERSION ${CMAKE_CURRENT_BINARY_DIR}/try_compile - SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/try_compile/${TEST_FILE}" - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CURRENT_INCLUDE_DIRECTORIES}" - LINK_LIBRARIES gRPC::grpc++ Threads::Threads - OUTPUT_VARIABLE TLS_CREDENTIALS_OPTIONS_CHECK_OUTPUT CXX_STANDARD 11) - if(HAS_GRPC_VERSION) - set(${DST_VAR} - "${DETECT_VERSION}" - CACHE INTERNAL "The detected (approximate) gRPC version.") - else() - message(STATUS "TlsCredentialsOptions (for gRPC ${DETECT_VERSION}) not found in grpc::experimental." - ) - if(ARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS) - message(WARNING "Build output:") - list(APPEND CMAKE_MESSAGE_INDENT "${TEST_FILE}: ") - message(WARNING ${TLS_CREDENTIALS_OPTIONS_CHECK_OUTPUT}) - list(REMOVE_AT CMAKE_MESSAGE_INDENT -1) - else() - message(DEBUG "Build output:") - list(APPEND CMAKE_MESSAGE_INDENT "${TEST_FILE}: ") - message(DEBUG ${TLS_CREDENTIALS_OPTIONS_CHECK_OUTPUT}) - list(REMOVE_AT CMAKE_MESSAGE_INDENT -1) - endif() - endif() - endif() -endfunction() - -if(GRPC_VENDORED) - # v1.35.0 -> 1.35 - string(REGEX MATCH "[0-9]+\\.[0-9]+" GRPC_VERSION "${ARROW_GRPC_BUILD_VERSION}") -else() - test_grpc_version(GRPC_VERSION "1.43" "check_tls_opts_143.cc") - test_grpc_version(GRPC_VERSION "1.36" "check_tls_opts_136.cc") - test_grpc_version(GRPC_VERSION "1.34" "check_tls_opts_134.cc") - test_grpc_version(GRPC_VERSION "1.32" "check_tls_opts_132.cc") - test_grpc_version(GRPC_VERSION "1.27" "check_tls_opts_127.cc") - message(STATUS "Found approximate gRPC version: ${GRPC_VERSION} (ARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS=${ARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS})" - ) -endif() -if(GRPC_VERSION EQUAL "1.27") - add_definitions(-DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc_impl::experimental) -elseif(GRPC_VERSION EQUAL "1.32") - add_definitions(-DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental) -elseif(GRPC_VERSION EQUAL "1.34" OR GRPC_VERSION EQUAL "1.35") +if(ARROW_GRPC_VERSION VERSION_GREATER_EQUAL "1.43") add_definitions(-DGRPC_USE_TLS_CHANNEL_CREDENTIALS_OPTIONS - -DGRPC_USE_TLS_CHANNEL_CREDENTIALS_OPTIONS_ROOT_CERTS - -DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental) -elseif(GRPC_VERSION EQUAL "1.36") + -DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental + -DGRPC_USE_CERTIFICATE_VERIFIER) +elseif(ARROW_GRPC_VERSION VERSION_GREATER_EQUAL "1.36") add_definitions(-DGRPC_USE_TLS_CHANNEL_CREDENTIALS_OPTIONS -DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental) -elseif((GRPC_VERSION EQUAL "1.43") OR (GRPC_VERSION EQUAL "1.46")) - # 1.46 is the bundled version +elseif(ARROW_GRPC_VERSION VERSION_GREATER_EQUAL "1.34") add_definitions(-DGRPC_USE_TLS_CHANNEL_CREDENTIALS_OPTIONS - -DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental - -DGRPC_USE_CERTIFICATE_VERIFIER) + -DGRPC_USE_TLS_CHANNEL_CREDENTIALS_OPTIONS_ROOT_CERTS + -DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental) +elseif(ARROW_GRPC_VERSION VERSION_GREATER_EQUAL "1.32") + add_definitions(-DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc::experimental) else() - message(STATUS "A proper version of gRPC could not be found to support TlsCredentialsOptions in Arrow Flight." 
- ) - message(STATUS "You may need a newer version of gRPC (>= 1.27), or the gRPC API has changed and Flight must be updated to match." - ) - if(ARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS) - message(FATAL_ERROR "Halting build since ARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS is set." - ) - endif() + add_definitions(-DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc_impl::experimental) endif() # Restore the CXXFLAGS that were modified above diff --git a/cpp/src/arrow/flight/client.cc b/cpp/src/arrow/flight/client.cc index e5e9f141aa62b..ec5377b7c11dc 100644 --- a/cpp/src/arrow/flight/client.cc +++ b/cpp/src/arrow/flight/client.cc @@ -79,16 +79,6 @@ arrow::Result> FlightStreamReader::ToTable( return Table::FromRecordBatches(schema, std::move(batches)); } -Status FlightStreamReader::ReadAll(std::vector>* batches, - const StopToken& stop_token) { - return ToRecordBatches(stop_token).Value(batches); -} - -Status FlightStreamReader::ReadAll(std::shared_ptr
* table, - const StopToken& stop_token) { - return ToTable(stop_token).Value(table); -} - /// \brief An ipc::MessageReader adapting the Flight ClientDataStream interface. /// /// In order to support app_metadata and reuse the existing IPC @@ -520,11 +510,6 @@ arrow::Result> FlightClient::Connect( return Connect(location, FlightClientOptions::Defaults()); } -Status FlightClient::Connect(const Location& location, - std::unique_ptr* client) { - return Connect(location, FlightClientOptions::Defaults()).Value(client); -} - arrow::Result> FlightClient::Connect( const Location& location, const FlightClientOptions& options) { flight::transport::grpc::InitializeFlightGrpcClient(); @@ -538,11 +523,6 @@ arrow::Result> FlightClient::Connect( return client; } -Status FlightClient::Connect(const Location& location, const FlightClientOptions& options, - std::unique_ptr* client) { - return Connect(location, options).Value(client); -} - Status FlightClient::Authenticate(const FlightCallOptions& options, std::unique_ptr auth_handler) { RETURN_NOT_OK(CheckOpen()); @@ -564,11 +544,6 @@ arrow::Result> FlightClient::DoAction( return results; } -Status FlightClient::DoAction(const FlightCallOptions& options, const Action& action, - std::unique_ptr* results) { - return DoAction(options, action).Value(results); -} - arrow::Result FlightClient::CancelFlightInfo( const FlightCallOptions& options, const CancelFlightInfoRequest& request) { ARROW_ASSIGN_OR_RAISE(auto body, request.SerializeToString()); @@ -601,11 +576,6 @@ arrow::Result> FlightClient::ListActions( return actions; } -Status FlightClient::ListActions(const FlightCallOptions& options, - std::vector* actions) { - return ListActions(options).Value(actions); -} - arrow::Result> FlightClient::GetFlightInfo( const FlightCallOptions& options, const FlightDescriptor& descriptor) { std::unique_ptr info; @@ -614,32 +584,16 @@ arrow::Result> FlightClient::GetFlightInfo( return info; } -Status FlightClient::GetFlightInfo(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* info) { - return GetFlightInfo(options, descriptor).Value(info); -} - arrow::Result> FlightClient::GetSchema( const FlightCallOptions& options, const FlightDescriptor& descriptor) { RETURN_NOT_OK(CheckOpen()); return transport_->GetSchema(options, descriptor); } -Status FlightClient::GetSchema(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* schema_result) { - return GetSchema(options, descriptor).Value(schema_result); -} - arrow::Result> FlightClient::ListFlights() { return ListFlights({}, {}); } -Status FlightClient::ListFlights(std::unique_ptr* listing) { - return ListFlights({}, {}).Value(listing); -} - arrow::Result> FlightClient::ListFlights( const FlightCallOptions& options, const Criteria& criteria) { std::unique_ptr listing; @@ -648,12 +602,6 @@ arrow::Result> FlightClient::ListFlights( return listing; } -Status FlightClient::ListFlights(const FlightCallOptions& options, - const Criteria& criteria, - std::unique_ptr* listing) { - return ListFlights(options, criteria).Value(listing); -} - arrow::Result> FlightClient::DoGet( const FlightCallOptions& options, const Ticket& ticket) { RETURN_NOT_OK(CheckOpen()); @@ -668,11 +616,6 @@ arrow::Result> FlightClient::DoGet( return stream_reader; } -Status FlightClient::DoGet(const FlightCallOptions& options, const Ticket& ticket, - std::unique_ptr* stream) { - return DoGet(options, ticket).Value(stream); -} - arrow::Result FlightClient::DoPut( const FlightCallOptions& 
options, const FlightDescriptor& descriptor, const std::shared_ptr& schema) { @@ -689,17 +632,6 @@ arrow::Result FlightClient::DoPut( return result; } -Status FlightClient::DoPut(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - const std::shared_ptr& schema, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto result, DoPut(options, descriptor, schema)); - *writer = std::move(result.writer); - *reader = std::move(result.reader); - return Status::OK(); -} - arrow::Result FlightClient::DoExchange( const FlightCallOptions& options, const FlightDescriptor& descriptor) { RETURN_NOT_OK(CheckOpen()); @@ -717,16 +649,6 @@ arrow::Result FlightClient::DoExchange( return result; } -Status FlightClient::DoExchange(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto result, DoExchange(options, descriptor)); - *writer = std::move(result.writer); - *reader = std::move(result.reader); - return Status::OK(); -} - Status FlightClient::Close() { if (!closed_) { closed_ = true; diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index ba9f688dce8b7..7204b469a6127 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -139,17 +139,9 @@ class ARROW_FLIGHT_EXPORT FlightStreamReader : public MetadataRecordBatchReader virtual arrow::Result>> ToRecordBatches( const StopToken& stop_token) = 0; - using MetadataRecordBatchReader::ReadAll; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.") - Status ReadAll(std::vector>* batches, - const StopToken& stop_token); - using MetadataRecordBatchReader::ToTable; /// \brief Consume entire stream as a Table arrow::Result> ToTable(const StopToken& stop_token); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.") - Status ReadAll(std::shared_ptr
* table, const StopToken& stop_token); }; // Silence warning @@ -196,9 +188,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// the connection was successful static arrow::Result> Connect(const Location& location); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Connect(const Location& location, std::unique_ptr* client); - /// \brief Connect to an unauthenticated flight service /// \param[in] location the URI /// \param[in] options Other options for setting up the client @@ -207,10 +196,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { static arrow::Result> Connect( const Location& location, const FlightClientOptions& options); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Connect(const Location& location, const FlightClientOptions& options, - std::unique_ptr* client); - /// \brief Authenticate to the server using the given handler. /// \param[in] options Per-RPC options /// \param[in] auth_handler The authentication mechanism to use @@ -239,14 +224,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoAction({}, action); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoAction(const FlightCallOptions& options, const Action& action, - std::unique_ptr* results); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoAction(const Action& action, std::unique_ptr* results) { - return DoAction({}, action).Value(results); - } - /// \brief Perform the CancelFlightInfo action, returning a /// CancelFlightInfoResult /// @@ -281,13 +258,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return ListActions(FlightCallOptions()); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListActions(const FlightCallOptions& options, std::vector* actions); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListActions(std::vector* actions) { - return ListActions().Value(actions); - } - /// \brief Request access plan for a single flight, which may be an existing /// dataset or a command to be executed /// \param[in] options Per-RPC options @@ -301,16 +271,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return GetFlightInfo({}, descriptor); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetFlightInfo(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* info); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetFlightInfo(const FlightDescriptor& descriptor, - std::unique_ptr* info) { - return GetFlightInfo({}, descriptor).Value(info); - } - /// \brief Request schema for a single flight, which may be an existing /// dataset or a command to be executed /// \param[in] options Per-RPC options @@ -320,27 +280,15 @@ class ARROW_FLIGHT_EXPORT FlightClient { arrow::Result> GetSchema( const FlightCallOptions& options, const FlightDescriptor& descriptor); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchema(const FlightCallOptions& options, const FlightDescriptor& descriptor, - std::unique_ptr* schema_result); - arrow::Result> GetSchema( const FlightDescriptor& descriptor) { return GetSchema({}, descriptor); } - ARROW_DEPRECATED("Deprecated in 8.0.0. 
Use Result-returning overload instead.") - Status GetSchema(const FlightDescriptor& descriptor, - std::unique_ptr* schema_result) { - return GetSchema({}, descriptor).Value(schema_result); - } /// \brief List all available flights known to the server /// \return Arrow result with an iterator that returns a FlightInfo for each flight arrow::Result> ListFlights(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListFlights(std::unique_ptr* listing); - /// \brief List available flights given indicated filter criteria /// \param[in] options Per-RPC options /// \param[in] criteria the filter criteria (opaque) @@ -348,10 +296,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { arrow::Result> ListFlights( const FlightCallOptions& options, const Criteria& criteria); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListFlights(const FlightCallOptions& options, const Criteria& criteria, - std::unique_ptr* listing); - /// \brief Given a flight ticket and schema, request to be sent the /// stream. Returns record batch stream reader /// \param[in] options Per-RPC options @@ -363,14 +307,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoGet({}, ticket); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoGet(const FlightCallOptions& options, const Ticket& ticket, - std::unique_ptr* stream); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoGet(const Ticket& ticket, std::unique_ptr* stream) { - return DoGet({}, ticket).Value(stream); - } - /// \brief DoPut return value struct DoPutResult { /// \brief a writer to write record batches to @@ -399,21 +335,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoPut({}, descriptor, schema); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoPut(const FlightCallOptions& options, const FlightDescriptor& descriptor, - const std::shared_ptr& schema, - std::unique_ptr* writer, - std::unique_ptr* reader); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoPut(const FlightDescriptor& descriptor, const std::shared_ptr& schema, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto output, DoPut({}, descriptor, schema)); - *writer = std::move(output.writer); - *reader = std::move(output.reader); - return Status::OK(); - } - struct DoExchangeResult { std::unique_ptr writer; std::unique_ptr reader; @@ -424,20 +345,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoExchange({}, descriptor); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoExchange(const FlightCallOptions& options, const FlightDescriptor& descriptor, - std::unique_ptr* writer, - std::unique_ptr* reader); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoExchange(const FlightDescriptor& descriptor, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto output, DoExchange({}, descriptor)); - *writer = std::move(output.writer); - *reader = std::move(output.reader); - return Status::OK(); - } - /// \brief Explicitly shut down and clean up the client. 
/// /// For backwards compatibility, this will be implicitly called by diff --git a/cpp/src/arrow/flight/client_middleware.h b/cpp/src/arrow/flight/client_middleware.h index 5b67e784b9eda..8e3126553a953 100644 --- a/cpp/src/arrow/flight/client_middleware.h +++ b/cpp/src/arrow/flight/client_middleware.h @@ -42,6 +42,11 @@ class ARROW_FLIGHT_EXPORT ClientMiddleware { virtual void SendingHeaders(AddCallHeaders* outgoing_headers) = 0; /// \brief A callback when headers are received from the server. + /// + /// This may be called more than once, since servers send both + /// headers and trailers. Some implementations (e.g. gRPC-Java, and + /// hence Arrow Flight in Java) may consolidate headers into + /// trailers if the RPC errored. virtual void ReceivedHeaders(const CallHeaders& incoming_headers) = 0; /// \brief A callback after the call has completed. diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index 87cd1ca887d5f..e56bab6db2092 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -34,6 +34,9 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/string.h" +// Include after Flight headers +#include + namespace arrow { namespace flight { @@ -381,27 +384,6 @@ TEST(FlightTypes, LocationConstruction) { ASSERT_EQ(location.ToString(), "grpc+unix:///tmp/test.sock"); } -ARROW_SUPPRESS_DEPRECATION_WARNING -TEST(FlightTypes, DeprecatedLocationConstruction) { - Location location; - ASSERT_RAISES(Invalid, Location::Parse("This is not an URI", &location)); - ASSERT_RAISES(Invalid, - Location::ForGrpcTcp("This is not a hostname", 12345, &location)); - ASSERT_RAISES(Invalid, - Location::ForGrpcTls("This is not a hostname", 12345, &location)); - ASSERT_RAISES(Invalid, Location::ForGrpcUnix("This is not a filename", &location)); - - ASSERT_OK(Location::Parse("s3://test", &location)); - ASSERT_EQ(location.ToString(), "s3://test"); - ASSERT_OK(Location::ForGrpcTcp("localhost", 12345, &location)); - ASSERT_EQ(location.ToString(), "grpc+tcp://localhost:12345"); - ASSERT_OK(Location::ForGrpcTls("localhost", 12345, &location)); - ASSERT_EQ(location.ToString(), "grpc+tls://localhost:12345"); - ASSERT_OK(Location::ForGrpcUnix("/tmp/test.sock", &location)); - ASSERT_EQ(location.ToString(), "grpc+unix:///tmp/test.sock"); -} -ARROW_UNSUPPRESS_DEPRECATION_WARNING - // ---------------------------------------------------------------------- // Cookie authentication/middleware @@ -672,6 +654,38 @@ TEST_F(TestCookieParsing, CookieCache) { AddCookieVerifyCache({"id0=0;", "id1=1;", "id2=2"}, "id0=0; id1=1; id2=2"); } +// ---------------------------------------------------------------------- +// Protobuf tests + +TEST(GrpcTransport, FlightDataDeserialize) { +#ifndef _WIN32 + pb::FlightData raw; + // Tack on known and unknown fields by hand here + raw.GetReflection()->MutableUnknownFields(&raw)->AddFixed32(900, 1024); + raw.GetReflection()->MutableUnknownFields(&raw)->AddFixed64(901, 1024); + raw.GetReflection()->MutableUnknownFields(&raw)->AddVarint(902, 1024); + raw.GetReflection()->MutableUnknownFields(&raw)->AddLengthDelimited(903, "foobar"); + // Known field comes at end + raw.GetReflection()->MutableUnknownFields(&raw)->AddLengthDelimited( + pb::FlightData::kDataBodyFieldNumber, "data"); + + auto serialized = raw.SerializeAsString(); + + grpc_slice slice = grpc_slice_from_copied_buffer(serialized.data(), serialized.size()); + // gRPC requires that grpc_slice and grpc::Slice have the same 
representation + grpc::ByteBuffer buffer(reinterpret_cast(&slice), /*nslices=*/1); + + flight::internal::FlightData out; + auto status = flight::transport::grpc::FlightDataDeserialize(&buffer, &out); + ASSERT_TRUE(status.ok()); + ASSERT_EQ("data", out.body->ToString()); + + grpc_slice_unref(slice); +#else + GTEST_SKIP() << "Can't use Protobuf symbols on Windows"; +#endif +} + // ---------------------------------------------------------------------- // Transport abstraction tests diff --git a/cpp/src/arrow/flight/flight_test.cc b/cpp/src/arrow/flight/flight_test.cc index d56dc81e356bd..1e7ea9bb002bb 100644 --- a/cpp/src/arrow/flight/flight_test.cc +++ b/cpp/src/arrow/flight/flight_test.cc @@ -46,11 +46,7 @@ #error "gRPC headers should not be in public API" #endif -#ifdef GRPCPP_PP_INCLUDE #include -#else -#include -#endif // Include before test_util.h (boost), contains Windows fixes #include "arrow/flight/platform.h" diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 6fb8ab1213117..049c6cee3ffcf 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -53,18 +53,10 @@ class ARROW_FLIGHT_EXPORT FlightDataStream { /// \brief Compute FlightPayload containing serialized RecordBatch schema virtual arrow::Result GetSchemaPayload() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchemaPayload(FlightPayload* payload) { - return GetSchemaPayload().Value(payload); - } - // When the stream is completed, the last payload written will have null // metadata virtual arrow::Result Next() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(FlightPayload* payload) { return Next().Value(payload); } - virtual Status Close(); }; @@ -130,6 +122,15 @@ class ARROW_FLIGHT_EXPORT ServerCallContext { virtual const std::string& peer_identity() const = 0; /// \brief The peer address (not validated) virtual const std::string& peer() const = 0; + /// \brief Add a response header. This is only valid before the server + /// starts sending the response; generally this isn't an issue unless you + /// are implementing FlightDataStream, ResultStream, or similar interfaces + /// yourself, or during a DoExchange or DoPut. + virtual void AddHeader(const std::string& key, const std::string& value) const = 0; + /// \brief Add a response trailer. This is only valid before the server + /// sends the final status; generally this isn't an issue unless your RPC + /// handler launches a thread or similar. + virtual void AddTrailer(const std::string& key, const std::string& value) const = 0; /// \brief Look up a middleware by key. Do not maintain a reference /// to the object beyond the request body. /// \return The middleware, or nullptr if not found. @@ -210,8 +211,10 @@ class ARROW_FLIGHT_EXPORT FlightServerBase { Status SetShutdownOnSignals(const std::vector sigs); /// \brief Start serving. - /// This method blocks until either Shutdown() is called or one of the signals - /// registered in SetShutdownOnSignals() is received. + /// This method blocks until the server shuts down. + /// + /// The server will start to shut down when either Shutdown() is called + /// or one of the signals registered in SetShutdownOnSignals() is received. Status Serve(); /// \brief Query whether Serve() was interrupted by a signal. 
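For orientation, a minimal sketch of a handler using the new ServerCallContext::AddHeader/AddTrailer hooks declared above. The class name and metadata keys are illustrative only, not part of this patch:

#include <memory>

#include "arrow/flight/server.h"

// Hypothetical server handler attaching response metadata via the new hooks.
class HeaderDemoServer : public arrow::flight::FlightServerBase {
 public:
  arrow::Status GetFlightInfo(const arrow::flight::ServerCallContext& context,
                              const arrow::flight::FlightDescriptor& request,
                              std::unique_ptr<arrow::flight::FlightInfo>* info) override {
    // Headers are only valid before the server starts sending the response.
    context.AddHeader("x-request-id", "demo");
    // Trailers may be added until the final status is sent.
    context.AddTrailer("x-served-by", "header-demo");
    return arrow::Status::NotImplemented("demo handler");
  }
};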
@@ -220,14 +223,18 @@ class ARROW_FLIGHT_EXPORT FlightServerBase {
   /// \return int the signal number that interrupted Serve(), if any, otherwise 0
   int GotSignal() const;

-  /// \brief Shut down the server. Can be called from signal handler or another
-  /// thread while Serve() blocks. Optionally a deadline can be set. Once the
-  /// the deadline expires server will wait until remaining running calls
-  /// complete.
+  /// \brief Shut down the server, blocking until current requests finish.
+  ///
+  /// Can be called from a signal handler or another thread while Serve()
+  /// blocks. Optionally a deadline can be set. Once the deadline expires
+  /// server will wait until remaining running calls complete.
+  ///
+  /// Should only be called once.
   Status Shutdown(const std::chrono::system_clock::time_point* deadline = NULLPTR);

-  /// \brief Block until server is terminated with Shutdown.
+  /// \brief Block until server shuts down with Shutdown.
+  ///
+  /// Does not respond to signals like Serve().
   Status Wait();

   // Implement these methods to create your own server. The default
diff --git a/cpp/src/arrow/flight/test_definitions.cc b/cpp/src/arrow/flight/test_definitions.cc
index 507c5ef40421c..4e137380044f3 100644
--- a/cpp/src/arrow/flight/test_definitions.cc
+++ b/cpp/src/arrow/flight/test_definitions.cc
@@ -18,17 +18,22 @@
 #include "arrow/flight/test_definitions.h"

 #include <chrono>
+#include <memory>
+#include <mutex>

 #include "arrow/array/array_base.h"
 #include "arrow/array/array_dict.h"
 #include "arrow/array/util.h"
 #include "arrow/flight/api.h"
+#include "arrow/flight/client_middleware.h"
 #include "arrow/flight/test_util.h"
 #include "arrow/table.h"
 #include "arrow/testing/generator.h"
+#include "arrow/testing/gtest_util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/config.h"
 #include "arrow/util/logging.h"
+#include "gmock/gmock.h"

 #if defined(ARROW_CUDA)
 #include "arrow/gpu/cuda_api.h"
@@ -1438,20 +1443,26 @@ class ErrorHandlingTestServer : public FlightServerBase {
  public:
   Status GetFlightInfo(const ServerCallContext& context, const FlightDescriptor& request,
                        std::unique_ptr<FlightInfo>* info) override {
-    if (request.path.size() >= 2) {
+    if (request.path.size() == 1 && request.path[0] == "metadata") {
+      context.AddHeader("x-header", "header-value");
+      context.AddHeader("x-header-bin", "header\x01value");
+      context.AddTrailer("x-trailer", "trailer-value");
+      context.AddTrailer("x-trailer-bin", "trailer\x01value");
+      return Status::Invalid("Expected");
+    } else if (request.path.size() >= 2) {
       const int raw_code = std::atoi(request.path[0].c_str());
       ARROW_ASSIGN_OR_RAISE(StatusCode code, TryConvertStatusCode(raw_code));
       if (request.path.size() == 2) {
-        return Status(code, request.path[1]);
+        return {code, request.path[1]};
       } else if (request.path.size() == 3) {
-        return Status(code, request.path[1], std::make_shared());
+        return {code, request.path[1], std::make_shared()};
       } else {
         const int raw_code = std::atoi(request.path[2].c_str());
         ARROW_ASSIGN_OR_RAISE(FlightStatusCode flight_code,
                               TryConvertFlightStatusCode(raw_code));
-        return Status(code, request.path[1],
-                      std::make_shared<FlightStatusDetail>(flight_code, request.path[3]));
+        return {code, request.path[1],
+                std::make_shared<FlightStatusDetail>(flight_code, request.path[3])};
       }
     }
     return Status::NotImplemented("NYI");
@@ -1469,20 +1480,70 @@ class ErrorHandlingTestServer : public FlightServerBase {
     return MakeFlightError(FlightStatusCode::Unauthorized, "Unauthorized", "extra info");
   }
 };
+
+class MetadataRecordingClientMiddleware : public ClientMiddleware {
+ public:
+  explicit
MetadataRecordingClientMiddleware( + std::mutex& mutex, std::vector>& headers) + : mutex_(mutex), headers_(headers) {} + void SendingHeaders(AddCallHeaders*) override {} + void ReceivedHeaders(const CallHeaders& incoming_headers) override { + std::lock_guard guard(mutex_); + for (const auto& [key, value] : incoming_headers) { + headers_.emplace_back(key, value); + } + } + void CallCompleted(const Status&) override {} + + private: + std::mutex& mutex_; + std::vector>& headers_; +}; + +class MetadataRecordingClientMiddlewareFactory : public ClientMiddlewareFactory { + public: + void StartCall(const CallInfo&, + std::unique_ptr* middleware) override { + *middleware = std::make_unique(mutex_, headers_); + } + + std::vector> GetHeaders() const { + std::lock_guard guard(mutex_); + // Take copy + return headers_; + } + + private: + mutable std::mutex mutex_; + std::vector> headers_; +}; } // namespace +struct ErrorHandlingTest::Impl { + std::shared_ptr metadata = + std::make_shared(); +}; + void ErrorHandlingTest::SetUpTest() { + impl_ = std::make_shared(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); ASSERT_OK(MakeServer( location, &server_, &client_, [](FlightServerOptions* options) { return Status::OK(); }, - [](FlightClientOptions* options) { return Status::OK(); })); + [&](FlightClientOptions* options) { + options->middleware.emplace_back(impl_->metadata); + return Status::OK(); + })); } void ErrorHandlingTest::TearDownTest() { ASSERT_OK(client_->Close()); ASSERT_OK(server_->Shutdown()); } +std::vector> ErrorHandlingTest::GetHeaders() { + return impl_->metadata->GetHeaders(); +} + void ErrorHandlingTest::TestGetFlightInfo() { std::unique_ptr info; for (const auto code : kStatusCodes) { @@ -1518,6 +1579,20 @@ void ErrorHandlingTest::TestGetFlightInfo() { } } +void ErrorHandlingTest::TestGetFlightInfoMetadata() { + auto descr = FlightDescriptor::Path({"metadata"}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Expected"), + client_->GetFlightInfo(descr)); + // This is janky because we don't/can't expose grpc::CallContext. 
+ // See https://github.com/apache/arrow/issues/34607 + ASSERT_THAT(GetHeaders(), ::testing::IsSupersetOf({ + std::make_pair("x-header", "header-value"), + std::make_pair("x-header-bin", "header\x01value"), + std::make_pair("x-trailer", "trailer-value"), + std::make_pair("x-trailer-bin", "trailer\x01value"), + })); +} + void CheckErrorDetail(const Status& status) { auto detail = FlightStatusDetail::UnwrapStatus(status); ASSERT_NE(detail, nullptr) << status.ToString(); diff --git a/cpp/src/arrow/flight/test_definitions.h b/cpp/src/arrow/flight/test_definitions.h index 7a7f905f3e97c..c73bc264b4966 100644 --- a/cpp/src/arrow/flight/test_definitions.h +++ b/cpp/src/arrow/flight/test_definitions.h @@ -265,10 +265,16 @@ class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest { // Test methods void TestGetFlightInfo(); + void TestGetFlightInfoMetadata(); void TestDoPut(); void TestDoExchange(); - private: + protected: + struct Impl; + + std::vector> GetHeaders(); + + std::shared_ptr impl_; std::unique_ptr client_; std::unique_ptr server_; }; @@ -277,6 +283,7 @@ class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest { static_assert(std::is_base_of::value, \ ARROW_STRINGIFY(FIXTURE) " must inherit from ErrorHandlingTest"); \ TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); } \ + TEST_F(FIXTURE, TestGetFlightInfoMetadata) { TestGetFlightInfoMetadata(); } \ TEST_F(FIXTURE, TestDoPut) { TestDoPut(); } \ TEST_F(FIXTURE, TestDoExchange) { TestDoExchange(); } diff --git a/cpp/src/arrow/flight/transport/grpc/customize_grpc.h b/cpp/src/arrow/flight/transport/grpc/customize_grpc.h index 1085a946966c8..5005fc6b16eb4 100644 --- a/cpp/src/arrow/flight/transport/grpc/customize_grpc.h +++ b/cpp/src/arrow/flight/transport/grpc/customize_grpc.h @@ -31,17 +31,8 @@ #pragma warning(disable : 4267) #endif -#ifdef GRPCPP_PP_INCLUDE #include -#else -#include -#endif - -#ifdef GRPCPP_PP_INCLUDE #include -#else -#include -#endif #ifdef _MSC_VER #pragma warning(pop) diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc index a1d0e3266b4e6..9b40015f9f729 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc @@ -25,15 +25,10 @@ #include #include -#include "arrow/util/config.h" -#ifdef GRPCPP_PP_INCLUDE #include #if defined(GRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS) #include #endif -#else -#include -#endif #include @@ -112,9 +107,9 @@ class GrpcClientInterceptorAdapter : public ::grpc::experimental::Interceptor { public: explicit GrpcClientInterceptorAdapter( std::vector> middleware) - : middleware_(std::move(middleware)), received_headers_(false) {} + : middleware_(std::move(middleware)) {} - void Intercept(::grpc::experimental::InterceptorBatchMethods* methods) { + void Intercept(::grpc::experimental::InterceptorBatchMethods* methods) override { using InterceptionHookPoints = ::grpc::experimental::InterceptionHookPoints; if (methods->QueryInterceptionHookPoint( InterceptionHookPoints::PRE_SEND_INITIAL_METADATA)) { @@ -147,10 +142,6 @@ class GrpcClientInterceptorAdapter : public ::grpc::experimental::Interceptor { private: void ReceivedHeaders( const std::multimap<::grpc::string_ref, ::grpc::string_ref>& metadata) { - if (received_headers_) { - return; - } - received_headers_ = true; CallHeaders headers; for (const auto& entry : metadata) { headers.insert({std::string_view(entry.first.data(), entry.first.length()), @@ -162,20 +153,14 @@ class GrpcClientInterceptorAdapter 
: public ::grpc::experimental::Interceptor { } std::vector> middleware_; - // When communicating with a gRPC-Java server, the server may not - // send back headers if the call fails right away. Instead, the - // headers will be consolidated into the trailers. We don't want to - // call the client middleware callback twice, so instead track - // whether we saw headers - if not, then we need to check trailers. - bool received_headers_; }; class GrpcClientInterceptorAdapterFactory : public ::grpc::experimental::ClientInterceptorFactoryInterface { public: - GrpcClientInterceptorAdapterFactory( + explicit GrpcClientInterceptorAdapterFactory( std::vector> middleware) - : middleware_(middleware) {} + : middleware_(std::move(middleware)) {} ::grpc::experimental::Interceptor* CreateClientInterceptor( ::grpc::experimental::ClientRpcInfo* info) override { diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_server.cc b/cpp/src/arrow/flight/transport/grpc/grpc_server.cc index dcf9c3f8c9f4b..50d4ffe002c7e 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_server.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_server.cc @@ -25,12 +25,7 @@ #include #include -#include "arrow/util/config.h" -#ifdef GRPCPP_PP_INCLUDE #include -#else -#include -#endif #include "arrow/buffer.h" #include "arrow/flight/serialization_internal.h" @@ -116,6 +111,7 @@ class GrpcServerAuthSender : public ServerAuthSender { }; class GrpcServerCallContext : public ServerCallContext { + public: explicit GrpcServerCallContext(::grpc::ServerContext* context) : context_(context), peer_(context_->peer()) { for (const auto& entry : context->client_metadata()) { @@ -148,6 +144,14 @@ class GrpcServerCallContext : public ServerCallContext { return ToGrpcStatus(status, context_); } + void AddHeader(const std::string& key, const std::string& value) const override { + context_->AddInitialMetadata(key, value); + } + + void AddTrailer(const std::string& key, const std::string& value) const override { + context_->AddTrailingMetadata(key, value); + } + ServerMiddleware* GetMiddleware(const std::string& key) const override { const auto& instance = middleware_map_.find(key); if (instance == middleware_map_.end()) { diff --git a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc index 8514ca361df33..372dca7a2c4c8 100644 --- a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc +++ b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -36,13 +37,8 @@ #include #include -#ifdef GRPCPP_PP_INCLUDE #include #include -#else -#include -#include -#endif #if defined(_MSC_VER) #pragma warning(pop) @@ -302,7 +298,7 @@ ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out, const auto remainder = static_cast( bit_util::RoundUpToMultipleOf8(buffer->size()) - buffer->size()); if (remainder) { - slices.push_back(::grpc::Slice(kPaddingBytes, remainder)); + slices.emplace_back(kPaddingBytes, remainder); } } } @@ -321,7 +317,7 @@ ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out, ::grpc::Status FlightDataDeserialize(ByteBuffer* buffer, arrow::flight::internal::FlightData* out) { if (!buffer) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "No payload"); + return {::grpc::StatusCode::INTERNAL, "No payload"}; } // Reset fields in case the caller reuses a single allocation @@ -347,42 +343,45 @@ ::grpc::Status FlightDataDeserialize(ByteBuffer* buffer, 
pb::FlightDescriptor pb_descriptor; uint32_t length; if (!pb_stream.ReadVarint32(&length)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to parse length of FlightDescriptor"); + return {::grpc::StatusCode::INTERNAL, + "Unable to parse length of FlightDescriptor"}; } // Can't use ParseFromCodedStream as this reads the entire // rest of the stream into the descriptor command field. std::string buffer; pb_stream.ReadString(&buffer, length); if (!pb_descriptor.ParseFromString(buffer)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to parse FlightDescriptor"); + return {::grpc::StatusCode::INTERNAL, "Unable to parse FlightDescriptor"}; } arrow::flight::FlightDescriptor descriptor; GRPC_RETURN_NOT_OK( arrow::flight::internal::FromProto(pb_descriptor, &descriptor)); - out->descriptor.reset(new arrow::flight::FlightDescriptor(descriptor)); + out->descriptor = std::make_unique(descriptor); } break; case pb::FlightData::kDataHeaderFieldNumber: { if (!ReadBytesZeroCopy(wrapped_buffer, &pb_stream, &out->metadata)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to read FlightData metadata"); + return {::grpc::StatusCode::INTERNAL, "Unable to read FlightData metadata"}; } } break; case pb::FlightData::kAppMetadataFieldNumber: { if (!ReadBytesZeroCopy(wrapped_buffer, &pb_stream, &out->app_metadata)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to read FlightData application metadata"); + return {::grpc::StatusCode::INTERNAL, + "Unable to read FlightData application metadata"}; } } break; case pb::FlightData::kDataBodyFieldNumber: { if (!ReadBytesZeroCopy(wrapped_buffer, &pb_stream, &out->body)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to read FlightData body"); + return {::grpc::StatusCode::INTERNAL, "Unable to read FlightData body"}; } } break; - default: - DCHECK(false) << "cannot happen"; + default: { + // Unknown field. We should skip it for compatibility. 
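+          // (The previous default branch was DCHECK(false); skipping instead
+          // keeps the reader compatible with senders whose FlightData carries
+          // fields this version does not know about.)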
+ if (!WireFormatLite::SkipField(&pb_stream, tag)) { + return {::grpc::StatusCode::INTERNAL, + "Could not skip unknown field tag in FlightData"}; + } + break; + } } } buffer->Clear(); diff --git a/cpp/src/arrow/flight/transport/grpc/util_internal.cc b/cpp/src/arrow/flight/transport/grpc/util_internal.cc index f9bf26058ad58..f431fc30ec87a 100644 --- a/cpp/src/arrow/flight/transport/grpc/util_internal.cc +++ b/cpp/src/arrow/flight/transport/grpc/util_internal.cc @@ -22,11 +22,7 @@ #include #include -#ifdef GRPCPP_PP_INCLUDE #include -#else -#include -#endif #include "arrow/flight/transport.h" #include "arrow/flight/types.h" diff --git a/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc b/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc index 3ac02bf7183a3..c3481d834f6ea 100644 --- a/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc +++ b/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc @@ -103,6 +103,8 @@ class UcxErrorHandlingTest : public ErrorHandlingTest, public ::testing::Test { std::string transport() const override { return "ucx"; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } + + void TestGetFlightInfoMetadata() { GTEST_SKIP() << "Middleware not implemented"; } }; ARROW_FLIGHT_TEST_ERROR_HANDLING(UcxErrorHandlingTest); diff --git a/cpp/src/arrow/flight/transport/ucx/ucx_server.cc b/cpp/src/arrow/flight/transport/ucx/ucx_server.cc index 4a573d742929a..8bbac34705c23 100644 --- a/cpp/src/arrow/flight/transport/ucx/ucx_server.cc +++ b/cpp/src/arrow/flight/transport/ucx/ucx_server.cc @@ -72,6 +72,9 @@ class UcxServerCallContext : public flight::ServerCallContext { public: const std::string& peer_identity() const override { return peer_; } const std::string& peer() const override { return peer_; } + // Not supported + void AddHeader(const std::string& key, const std::string& value) const override {} + void AddTrailer(const std::string& key, const std::string& value) const override {} ServerMiddleware* GetMiddleware(const std::string& key) const override { return nullptr; } diff --git a/cpp/src/arrow/flight/try_compile/check_tls_opts_127.cc b/cpp/src/arrow/flight/try_compile/check_tls_opts_127.cc deleted file mode 100644 index 11de4989911d2..0000000000000 --- a/cpp/src/arrow/flight/try_compile/check_tls_opts_127.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Dummy file for checking if TlsCredentialsOptions exists in -// the grpc_impl::experimental namespace. gRPC versions 1.27-1.31 -// put it here. This is for supporting disabling server -// validation when using TLS. 
- -#include -#include -#include - -static grpc_tls_server_verification_option check( - const grpc_impl::experimental::TlsCredentialsOptions* options) { - grpc_tls_server_verification_option server_opt = options->server_verification_option(); - return server_opt; -} - -int main(int argc, const char** argv) { - [[maybe_unused]] grpc_tls_server_verification_option opt = check(nullptr); - return 0; -} diff --git a/cpp/src/arrow/flight/try_compile/check_tls_opts_134.cc b/cpp/src/arrow/flight/try_compile/check_tls_opts_134.cc deleted file mode 100644 index 4ee2122ef57e7..0000000000000 --- a/cpp/src/arrow/flight/try_compile/check_tls_opts_134.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Dummy file for checking if TlsCredentialsOptions exists in -// the grpc::experimental namespace. gRPC starting from 1.34 -// put it here. This is for supporting disabling server -// validation when using TLS. - -#include -#include -#include - -// Dummy file for checking if TlsCredentialsOptions exists in -// the grpc::experimental namespace. gRPC starting from 1.34 -// puts it here. This is for supporting disabling server -// validation when using TLS. 
- -static void check() { - // In 1.34, there's no parameterless constructor; in 1.36, there's - // only a parameterless constructor - auto options = - std::make_shared(nullptr); - options->set_server_verification_option( - grpc_tls_server_verification_option::GRPC_TLS_SERVER_VERIFICATION); -} - -int main(int argc, const char** argv) { - check(); - return 0; -} diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 44c0c6547cbb1..7c72595ed624b 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -158,11 +158,6 @@ arrow::Result> SchemaResult::Make(const Schema& sc return std::make_unique(std::move(schema_in)); } -Status SchemaResult::GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const { - return GetSchema(dictionary_memo).Value(out); -} - std::string SchemaResult::ToString() const { return ""; } @@ -206,10 +201,6 @@ arrow::Result FlightDescriptor::SerializeToString() const { return out; } -Status FlightDescriptor::SerializeToString(std::string* out) const { - return SerializeToString().Value(out); -} - arrow::Result FlightDescriptor::Deserialize( std::string_view serialized) { pb::FlightDescriptor pb_descriptor; @@ -226,11 +217,6 @@ arrow::Result FlightDescriptor::Deserialize( return out; } -Status FlightDescriptor::Deserialize(const std::string& serialized, - FlightDescriptor* out) { - return Deserialize(serialized).Value(out); -} - std::string Ticket::ToString() const { std::stringstream ss; ss << ""; @@ -250,10 +236,6 @@ arrow::Result Ticket::SerializeToString() const { return out; } -Status Ticket::SerializeToString(std::string* out) const { - return SerializeToString().Value(out); -} - arrow::Result Ticket::Deserialize(std::string_view serialized) { pb::Ticket pb_ticket; if (serialized.size() > static_cast(std::numeric_limits::max())) { @@ -269,10 +251,6 @@ arrow::Result Ticket::Deserialize(std::string_view serialized) { return out; } -Status Ticket::Deserialize(const std::string& serialized, Ticket* out) { - return Deserialize(serialized).Value(out); -} - arrow::Result FlightInfo::Make(const Schema& schema, const FlightDescriptor& descriptor, const std::vector& endpoints, @@ -299,11 +277,6 @@ arrow::Result> FlightInfo::GetSchema( return schema_; } -Status FlightInfo::GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const { - return GetSchema(dictionary_memo).Value(out); -} - arrow::Result FlightInfo::SerializeToString() const { pb::FlightInfo pb_info; RETURN_NOT_OK(internal::ToProto(*this, &pb_info)); @@ -315,10 +288,6 @@ arrow::Result FlightInfo::SerializeToString() const { return out; } -Status FlightInfo::SerializeToString(std::string* out) const { - return SerializeToString().Value(out); -} - arrow::Result> FlightInfo::Deserialize( std::string_view serialized) { pb::FlightInfo pb_info; @@ -335,11 +304,6 @@ arrow::Result> FlightInfo::Deserialize( return std::make_unique(std::move(data)); } -Status FlightInfo::Deserialize(const std::string& serialized, - std::unique_ptr* out) { - return Deserialize(serialized).Value(out); -} - std::string FlightInfo::ToString() const { std::stringstream ss; ss << " Location::ForGrpcTls(const std::string& host, const int port) { std::stringstream uri_string; uri_string << "grpc+tls://" << host << ':' << port; return Location::Parse(uri_string.str()); } -Status Location::ForGrpcTls(const std::string& host, const int port, Location* location) { - return ForGrpcTls(host, port).Value(location); -} - arrow::Result Location::ForGrpcUnix(const std::string& 
path) { std::stringstream uri_string; uri_string << "grpc+unix://" << path; return Location::Parse(uri_string.str()); } -Status Location::ForGrpcUnix(const std::string& path, Location* location) { - return ForGrpcUnix(path).Value(location); -} - arrow::Result Location::ForScheme(const std::string& scheme, const std::string& host, const int port) { std::stringstream uri_string; @@ -808,8 +752,6 @@ std::ostream& operator<<(std::ostream& os, CancelStatus status) { return os; } -Status ResultStream::Next(std::unique_ptr* info) { return Next().Value(info); } - Status ResultStream::Drain() { while (true) { ARROW_ASSIGN_OR_RAISE(auto result, Next()); @@ -818,10 +760,6 @@ Status ResultStream::Drain() { return Status::OK(); } -Status MetadataRecordBatchReader::Next(FlightStreamChunk* next) { - return Next().Value(next); -} - arrow::Result>> MetadataRecordBatchReader::ToRecordBatches() { std::vector> batches; @@ -833,21 +771,12 @@ MetadataRecordBatchReader::ToRecordBatches() { return batches; } -Status MetadataRecordBatchReader::ReadAll( - std::vector>* batches) { - return ToRecordBatches().Value(batches); -} - arrow::Result> MetadataRecordBatchReader::ToTable() { ARROW_ASSIGN_OR_RAISE(auto batches, ToRecordBatches()); ARROW_ASSIGN_OR_RAISE(auto schema, GetSchema()); return Table::FromRecordBatches(schema, std::move(batches)); } -Status MetadataRecordBatchReader::ReadAll(std::shared_ptr
* table) { - return ToTable().Value(table); -} - Status MetadataRecordBatchWriter::Begin(const std::shared_ptr& schema) { return Begin(schema, ipc::IpcWriteOptions::Defaults()); } @@ -934,10 +863,6 @@ arrow::Result BasicAuth::Deserialize(std::string_view serialized) { return out; } -Status BasicAuth::Deserialize(const std::string& serialized, BasicAuth* out) { - return Deserialize(serialized).Value(out); -} - arrow::Result BasicAuth::SerializeToString() const { pb::BasicAuth pb_result; RETURN_NOT_OK(internal::ToProto(*this, &pb_result)); @@ -948,8 +873,5 @@ arrow::Result BasicAuth::SerializeToString() const { return out; } -Status BasicAuth::Serialize(const BasicAuth& basic_auth, std::string* out) { - return basic_auth.SerializeToString().Value(out); -} } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 3cca774314017..ca86c27e86976 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -311,12 +311,6 @@ struct ARROW_FLIGHT_EXPORT BasicAuth { static arrow::Result Deserialize(std::string_view serialized); /// \brief Serialize this message to its wire-format representation. arrow::Result SerializeToString() const; - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, BasicAuth* out); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Serialize(const BasicAuth& basic_auth, std::string* out); }; /// \brief A request to retrieve or generate a dataset @@ -349,18 +343,12 @@ struct ARROW_FLIGHT_EXPORT FlightDescriptor { /// services) that may want to return Flight types. arrow::Result SerializeToString() const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status SerializeToString(std::string* out) const; - /// \brief Parse the wire-format representation of this type. /// /// Useful when interoperating with non-Flight systems (e.g. REST /// services) that may want to return Flight types. static arrow::Result Deserialize(std::string_view serialized); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, FlightDescriptor* out); - // Convenience factory functions static FlightDescriptor Command(const std::string& c) { @@ -400,17 +388,11 @@ struct ARROW_FLIGHT_EXPORT Ticket { /// services) that may want to return Flight types. arrow::Result SerializeToString() const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status SerializeToString(std::string* out) const; - /// \brief Parse the wire-format representation of this type. /// /// Useful when interoperating with non-Flight systems (e.g. REST /// services) that may want to return Flight types. static arrow::Result Deserialize(std::string_view serialized); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, Ticket* out); }; class FlightClient; @@ -434,9 +416,6 @@ struct ARROW_FLIGHT_EXPORT Location { /// \brief Initialize a location by parsing a URI string static arrow::Result Parse(const std::string& uri_string); - ARROW_DEPRECATED("Deprecated in 8.0.0. 
Use Result-returning overload instead.") - static Status Parse(const std::string& uri_string, Location* location); - /// \brief Initialize a location for a non-TLS, gRPC-based Flight /// service from a host and port /// \param[in] host The hostname to connect to @@ -444,9 +423,6 @@ struct ARROW_FLIGHT_EXPORT Location { /// \return Arrow result with the resulting location static arrow::Result ForGrpcTcp(const std::string& host, const int port); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status ForGrpcTcp(const std::string& host, const int port, Location* location); - /// \brief Initialize a location for a TLS-enabled, gRPC-based Flight /// service from a host and port /// \param[in] host The hostname to connect to @@ -454,18 +430,12 @@ struct ARROW_FLIGHT_EXPORT Location { /// \return Arrow result with the resulting location static arrow::Result ForGrpcTls(const std::string& host, const int port); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status ForGrpcTls(const std::string& host, const int port, Location* location); - /// \brief Initialize a location for a domain socket-based Flight /// service /// \param[in] path The path to the domain socket /// \return Arrow result with the resulting location static arrow::Result ForGrpcUnix(const std::string& path); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status ForGrpcUnix(const std::string& path, Location* location); - /// \brief Initialize a location based on a URI scheme static arrow::Result ForScheme(const std::string& scheme, const std::string& host, const int port); @@ -576,10 +546,6 @@ struct ARROW_FLIGHT_EXPORT SchemaResult { arrow::Result> GetSchema( ipc::DictionaryMemo* dictionary_memo) const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const; - const std::string& serialized_schema() const { return raw_schema_; } std::string ToString() const; @@ -633,10 +599,6 @@ class ARROW_FLIGHT_EXPORT FlightInfo { arrow::Result> GetSchema( ipc::DictionaryMemo* dictionary_memo) const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const; - const std::string& serialized_schema() const { return data_.schema; } /// The descriptor associated with this flight, may not be set @@ -661,9 +623,6 @@ class ARROW_FLIGHT_EXPORT FlightInfo { /// services) that may want to return Flight types. arrow::Result SerializeToString() const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status SerializeToString(std::string* out) const; - /// \brief Parse the wire-format representation of this type. /// /// Useful when interoperating with non-Flight systems (e.g. REST @@ -671,10 +630,6 @@ class ARROW_FLIGHT_EXPORT FlightInfo { static arrow::Result> Deserialize( std::string_view serialized); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, - std::unique_ptr* out); - std::string ToString() const; /// Compare two FlightInfo for equality. This will compare the @@ -727,9 +682,6 @@ class ARROW_FLIGHT_EXPORT FlightListing { /// \return Arrow result with a single FlightInfo. Set to \a nullptr if there /// are none left. 
virtual arrow::Result> Next() = 0; - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(std::unique_ptr* info); }; /// \brief An iterator to Result instances returned by DoAction. @@ -741,9 +693,6 @@ class ARROW_FLIGHT_EXPORT ResultStream { /// \return Arrow result with a single Result. Set to \a nullptr if there are none left. virtual arrow::Result> Next() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(std::unique_ptr* info); - /// \brief Read and drop the remaining messages to get the error (if any) from a server. /// \return Status OK if there is no error from a server, any other status if a /// server returns an error. @@ -770,20 +719,11 @@ class ARROW_FLIGHT_EXPORT MetadataRecordBatchReader { /// nullptr. virtual arrow::Result Next() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(FlightStreamChunk* next); - /// \brief Consume entire stream as a vector of record batches virtual arrow::Result>> ToRecordBatches(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.") - Status ReadAll(std::vector>* batches); - /// \brief Consume entire stream as a Table virtual arrow::Result> ToTable(); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.") - Status ReadAll(std::shared_ptr
* table); }; /// \brief Convert a MetadataRecordBatchReader to a regular RecordBatchReader. diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index f754c07d13c89..869ea6453ccda 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -384,7 +384,8 @@ Result> CudaMemoryManager::ViewBufferTo( if (to->is_cpu()) { // Device-on-CPU view ARROW_ASSIGN_OR_RAISE(auto address, GetHostAddress(buf->address())); - return std::make_shared(address, buf->size(), to, buf); + return std::make_shared(address, buf->size(), to, buf, + DeviceAllocationType::kCUDA_HOST); } return nullptr; } diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h index 0115ed19a103d..a1b95c7b4181d 100644 --- a/cpp/src/arrow/gpu/cuda_context.h +++ b/cpp/src/arrow/gpu/cuda_context.h @@ -92,6 +92,10 @@ class ARROW_EXPORT CudaDevice : public Device { std::string ToString() const override; bool Equals(const Device&) const override; std::shared_ptr default_memory_manager() override; + DeviceAllocationType device_type() const override { + return DeviceAllocationType::kCUDA; + } + int64_t device_id() const override { return device_number(); } /// \brief Return a CudaDevice instance for a particular device /// \param[in] device_number the CUDA device number diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 297e4dcf71e44..860c6311d7b2f 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -198,6 +198,11 @@ Result> CudaBuffer::ExportForIpc() { return handle; } +CudaHostBuffer::CudaHostBuffer(uint8_t* data, const int64_t size) + : MutableBuffer(data, size) { + device_type_ = DeviceAllocationType::kCUDA_HOST; +} + CudaHostBuffer::~CudaHostBuffer() { auto maybe_manager = CudaDeviceManager::Instance(); ARROW_CHECK_OK(maybe_manager.status()); @@ -480,5 +485,21 @@ Result GetHostAddress(uintptr_t device_ptr) { return static_cast(ptr); } +Result> DefaultMemoryMapper(ArrowDeviceType device_type, + int64_t device_id) { + switch (device_type) { + case ARROW_DEVICE_CPU: + return default_cpu_memory_manager(); + case ARROW_DEVICE_CUDA: + case ARROW_DEVICE_CUDA_HOST: + case ARROW_DEVICE_CUDA_MANAGED: { + ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + return device->default_memory_manager(); + } + default: + return Status::NotImplemented("memory manager not implemented for device"); + } +} + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h index 18c23a507805a..d323bef03494e 100644 --- a/cpp/src/arrow/gpu/cuda_memory.h +++ b/cpp/src/arrow/gpu/cuda_memory.h @@ -21,6 +21,7 @@ #include #include "arrow/buffer.h" +#include "arrow/c/abi.h" #include "arrow/io/concurrency.h" #include "arrow/type_fwd.h" @@ -110,7 +111,8 @@ class ARROW_EXPORT CudaBuffer : public Buffer { /// \brief Device-accessible CPU memory created using cudaHostAlloc class ARROW_EXPORT CudaHostBuffer : public MutableBuffer { public: - using MutableBuffer::MutableBuffer; + CudaHostBuffer(uint8_t* data, const int64_t size); + ~CudaHostBuffer(); /// \brief Return a device address the GPU can read this memory from. 
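DefaultMemoryMapper, defined in cuda_memory.cc above and exported from cuda_memory.h below, resolves a C Device Interface device type to a MemoryManager. A minimal usage sketch; the device id 0 is an arbitrary value chosen for illustration:

#include "arrow/c/abi.h"
#include "arrow/gpu/cuda_memory.h"

arrow::Result<std::shared_ptr<arrow::MemoryManager>> MapCudaDevice() {
  // ARROW_DEVICE_CUDA comes from the C Data Interface ABI header.
  return arrow::cuda::DefaultMemoryMapper(ARROW_DEVICE_CUDA, /*device_id=*/0);
}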
@@ -258,5 +260,9 @@ Result GetDeviceAddress(const uint8_t* cpu_data, ARROW_EXPORT Result GetHostAddress(uintptr_t device_ptr); +ARROW_EXPORT +Result> DefaultMemoryMapper(ArrowDeviceType device_type, + int64_t device_id); + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_test.cc b/cpp/src/arrow/gpu/cuda_test.cc index aac45d13831e5..6d392213e231f 100644 --- a/cpp/src/arrow/gpu/cuda_test.cc +++ b/cpp/src/arrow/gpu/cuda_test.cc @@ -364,6 +364,7 @@ TEST_F(TestCudaHostBuffer, AllocateGlobal) { ASSERT_TRUE(host_buffer->is_cpu()); ASSERT_EQ(host_buffer->memory_manager(), cpu_mm_); + ASSERT_EQ(host_buffer->device_type(), DeviceAllocationType::kCUDA_HOST); ASSERT_OK_AND_ASSIGN(auto device_address, host_buffer->GetDeviceAddress(context_)); ASSERT_NE(device_address, 0); @@ -376,6 +377,7 @@ TEST_F(TestCudaHostBuffer, ViewOnDevice) { ASSERT_TRUE(host_buffer->is_cpu()); ASSERT_EQ(host_buffer->memory_manager(), cpu_mm_); + ASSERT_EQ(host_buffer->device_type(), DeviceAllocationType::kCUDA_HOST); // Try to view the host buffer on the device. This should correspond to // GetDeviceAddress() in the previous test. @@ -385,6 +387,7 @@ TEST_F(TestCudaHostBuffer, ViewOnDevice) { ASSERT_NE(device_buffer->address(), 0); ASSERT_EQ(device_buffer->size(), host_buffer->size()); ASSERT_EQ(device_buffer->parent(), host_buffer); + ASSERT_EQ(device_buffer->device_type(), DeviceAllocationType::kCUDA); // View back the device buffer on the CPU. This should roundtrip. ASSERT_OK_AND_ASSIGN(auto buffer, Buffer::View(device_buffer, cpu_mm_)); @@ -393,6 +396,7 @@ TEST_F(TestCudaHostBuffer, ViewOnDevice) { ASSERT_EQ(buffer->address(), host_buffer->address()); ASSERT_EQ(buffer->size(), host_buffer->size()); ASSERT_EQ(buffer->parent(), device_buffer); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCUDA_HOST); } // ------------------------------------------------------------------------ diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index c5355c9422756..dcbe4feb261fb 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -96,10 +96,6 @@ struct ARROW_EXPORT IOContext { StopToken stop_token_; }; -struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext { - using IOContext::IOContext; -}; - class ARROW_EXPORT FileInterface { public: virtual ~FileInterface() = 0; diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 3ce99d6f84a40..683e72878b9b1 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -348,19 +348,11 @@ Result RecordBatchReader::ToRecordBatches() { return batches; } -Status RecordBatchReader::ReadAll(RecordBatchVector* batches) { - return ToRecordBatches().Value(batches); -} - Result> RecordBatchReader::ToTable() { ARROW_ASSIGN_OR_RAISE(auto batches, ToRecordBatches()); return Table::FromRecordBatches(schema(), std::move(batches)); } -Status RecordBatchReader::ReadAll(std::shared_ptr
* table) { - return ToTable().Value(table); -} - class SimpleRecordBatchReader : public RecordBatchReader { public: SimpleRecordBatchReader(Iterator> it, diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 8f9b5882d93ac..d728d5eb0da2f 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -326,15 +326,9 @@ class ARROW_EXPORT RecordBatchReader { /// \brief Consume entire stream as a vector of record batches Result ToRecordBatches(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.") - Status ReadAll(RecordBatchVector* batches); - /// \brief Read all batches and concatenate as arrow::Table Result> ToTable(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.") - Status ReadAll(std::shared_ptr
* table); - /// \brief Create a RecordBatchReader from a vector of RecordBatch. /// /// \param[in] batches the vector of RecordBatch to read from diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index def6dbc54edcb..4975e94325d32 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -496,32 +496,4 @@ TEST_F(TestRecordBatchReader, ToTable) { ASSERT_EQ(table->column(0)->chunks().size(), 0); } -ARROW_SUPPRESS_DEPRECATION_WARNING -TEST_F(TestRecordBatchReader, DeprecatedReadAllToRecordBatches) { - RecordBatchVector batches; - ASSERT_OK(reader_->ReadAll(&batches)); - ASSERT_EQ(batches.size(), batches_.size()); - for (size_t index = 0; index < batches.size(); index++) { - AssertBatchesEqual(*batches[index], *batches_[index]); - } - - ASSERT_OK(reader_->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 0); -} - -TEST_F(TestRecordBatchReader, DeprecatedReadAllToTable) { - std::shared_ptr
table; - - ASSERT_OK(reader_->ReadAll(&table)); - const auto& chunks = table->column(0)->chunks(); - ASSERT_EQ(chunks.size(), batches_.size()); - for (size_t index = 0; index < batches_.size(); index++) { - AssertArraysEqual(*chunks[index], *batches_[index]->column(0)); - } - - ASSERT_OK(reader_->ReadAll(&table)); - ASSERT_EQ(table->column(0)->chunks().size(), 0); -} -ARROW_UNSUPPRESS_DEPRECATION_WARNING - } // namespace arrow diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 0797306a67413..1d1ce4aa72948 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -131,6 +131,13 @@ struct ARROW_EXPORT NullScalar : public Scalar { namespace internal { +struct ARROW_EXPORT ArraySpanFillFromScalarScratchSpace { + // 16 bytes of scratch space to enable ArraySpan to be a view onto any + // Scalar- including binary scalars where we need to create a buffer + // that looks like two 32-bit or 64-bit offsets. + alignas(int64_t) mutable uint8_t scratch_space_[sizeof(int64_t) * 2]; +}; + struct ARROW_EXPORT PrimitiveScalarBase : public Scalar { explicit PrimitiveScalarBase(std::shared_ptr type) : Scalar(std::move(type), false) {} @@ -238,7 +245,9 @@ struct ARROW_EXPORT DoubleScalar : public NumericScalar { using NumericScalar::NumericScalar; }; -struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { +struct ARROW_EXPORT BaseBinaryScalar + : public internal::PrimitiveScalarBase, + private internal::ArraySpanFillFromScalarScratchSpace { using internal::PrimitiveScalarBase::PrimitiveScalarBase; using ValueType = std::shared_ptr; @@ -257,6 +266,8 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} + + friend ArraySpan; }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { @@ -464,7 +475,9 @@ struct ARROW_EXPORT Decimal256Scalar : public DecimalScalar; @@ -472,6 +485,9 @@ struct ARROW_EXPORT BaseListScalar : public Scalar { bool is_valid = true); std::shared_ptr value; + + private: + friend struct ArraySpan; }; struct ARROW_EXPORT ListScalar : public BaseListScalar { @@ -519,7 +535,8 @@ struct ARROW_EXPORT StructScalar : public Scalar { std::vector field_names); }; -struct ARROW_EXPORT UnionScalar : public Scalar { +struct ARROW_EXPORT UnionScalar : public Scalar, + private internal::ArraySpanFillFromScalarScratchSpace { int8_t type_code; virtual const std::shared_ptr& child_value() const = 0; @@ -527,6 +544,8 @@ struct ARROW_EXPORT UnionScalar : public Scalar { protected: UnionScalar(std::shared_ptr type, int8_t type_code, bool is_valid) : Scalar(std::move(type), is_valid), type_code(type_code) {} + + friend struct ArraySpan; }; struct ARROW_EXPORT SparseUnionScalar : public UnionScalar { @@ -568,7 +587,9 @@ struct ARROW_EXPORT DenseUnionScalar : public UnionScalar { value(std::move(value)) {} }; -struct ARROW_EXPORT RunEndEncodedScalar : public Scalar { +struct ARROW_EXPORT RunEndEncodedScalar + : public Scalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = RunEndEncodedType; using ValueType = std::shared_ptr; @@ -589,6 +610,8 @@ struct ARROW_EXPORT RunEndEncodedScalar : public Scalar { private: const TypeClass& ree_type() const { return internal::checked_cast(*type); } + + friend ArraySpan; }; /// \brief A Scalar value for DictionaryType diff --git a/cpp/src/arrow/table_builder.cc b/cpp/src/arrow/table_builder.cc index 
414aa263cc7f0..19ca151ac200f 100644 --- a/cpp/src/arrow/table_builder.cc +++ b/cpp/src/arrow/table_builder.cc @@ -36,19 +36,6 @@ RecordBatchBuilder::RecordBatchBuilder(const std::shared_ptr& schema, MemoryPool* pool, int64_t initial_capacity) : schema_(schema), initial_capacity_(initial_capacity), pool_(pool) {} -Status RecordBatchBuilder::Make(const std::shared_ptr& schema, MemoryPool* pool, - std::unique_ptr* builder) { - ARROW_ASSIGN_OR_RAISE(*builder, Make(schema, pool, kMinBuilderCapacity)) - return Status::OK(); -} - -Status RecordBatchBuilder::Make(const std::shared_ptr& schema, MemoryPool* pool, - int64_t initial_capacity, - std::unique_ptr* builder) { - ARROW_ASSIGN_OR_RAISE(*builder, Make(schema, pool, initial_capacity)) - return Status::OK(); -} - Result> RecordBatchBuilder::Make( const std::shared_ptr& schema, MemoryPool* pool) { return Make(schema, pool, kMinBuilderCapacity); @@ -63,17 +50,6 @@ Result> RecordBatchBuilder::Make( return std::move(builder); } -Status RecordBatchBuilder::Flush(bool reset_builders, - std::shared_ptr* batch) { - ARROW_ASSIGN_OR_RAISE(*batch, Flush(reset_builders)); - return Status::OK(); -} - -Status RecordBatchBuilder::Flush(std::shared_ptr* batch) { - ARROW_ASSIGN_OR_RAISE(*batch, Flush(true)); - return Status::OK(); -} - Result> RecordBatchBuilder::Flush(bool reset_builders) { std::vector> fields; fields.resize(this->num_fields()); diff --git a/cpp/src/arrow/table_builder.h b/cpp/src/arrow/table_builder.h index 65ebd86ea416e..671cc4ab97996 100644 --- a/cpp/src/arrow/table_builder.h +++ b/cpp/src/arrow/table_builder.h @@ -38,24 +38,6 @@ class RecordBatch; /// schema class ARROW_EXPORT RecordBatchBuilder { public: - /// \brief Create and initialize a RecordBatchBuilder - /// \param[in] schema The schema for the record batch - /// \param[in] pool A MemoryPool to use for allocations - /// \param[in] builder the created builder instance - ARROW_DEPRECATED("Deprecated in 9.0.0. Use Result-returning variant.") - static Status Make(const std::shared_ptr& schema, MemoryPool* pool, - std::unique_ptr* builder); - - /// \brief Create and initialize a RecordBatchBuilder - /// \param[in] schema The schema for the record batch - /// \param[in] pool A MemoryPool to use for allocations - /// \param[in] initial_capacity The initial capacity for the builders - /// \param[in] builder the created builder instance - ARROW_DEPRECATED("Deprecated in 9.0.0. Use Result-returning variant.") - static Status Make(const std::shared_ptr& schema, MemoryPool* pool, - int64_t initial_capacity, - std::unique_ptr* builder); - /// \brief Create and initialize a RecordBatchBuilder /// \param[in] schema The schema for the record batch /// \param[in] pool A MemoryPool to use for allocations @@ -84,19 +66,6 @@ class ARROW_EXPORT RecordBatchBuilder { return internal::checked_cast(raw_field_builders_[i]); } - /// \brief Finish current batch and optionally reset - /// \param[in] reset_builders the resulting RecordBatch - /// \param[out] batch the resulting RecordBatch - /// \return Status - ARROW_DEPRECATED("Deprecated in 9.0.0. Use Result-returning variant.") - Status Flush(bool reset_builders, std::shared_ptr* batch); - - /// \brief Finish current batch and reset - /// \param[out] batch the resulting RecordBatch - /// \return Status - ARROW_DEPRECATED("Deprecated in 9.0.0. 
Use Result-returning variant.") - Status Flush(std::shared_ptr* batch); - /// \brief Finish current batch and optionally reset /// \param[in] reset_builders the resulting RecordBatch /// \return the resulting RecordBatch diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b8ea247a43746..b74c41f75e452 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -33,6 +33,7 @@ #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_primitive.h" #include "arrow/buffer.h" +#include "arrow/extension_type.h" #include "arrow/record_batch.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -935,14 +936,27 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t case Type::type::SPARSE_UNION: case Type::type::DENSE_UNION: { ArrayVector child_arrays(field.type()->num_fields()); - for (int i = 0; i < field.type()->num_fields(); i++) { + for (int i = 0; i < field.type()->num_fields(); ++i) { const auto& child_field = field.type()->field(i); child_arrays[i] = ArrayOf(*child_field, length, alignment, memory_pool); } auto array = field.type()->id() == Type::type::SPARSE_UNION ? SparseUnion(child_arrays, length, alignment, memory_pool) : DenseUnion(child_arrays, length, alignment, memory_pool); - return *array->View(field.type()); + + const auto& type_codes = checked_cast(*field.type()).type_codes(); + const auto& default_type_codes = + checked_cast(*array->type()).type_codes(); + + if (type_codes != default_type_codes) { + // map to the type ids specified by the UnionType + auto* type_ids = + reinterpret_cast(array->data()->buffers[1]->mutable_data()); + for (int64_t i = 0; i != array->length(); ++i) { + type_ids[i] = type_codes[type_ids[i]]; + } + } + return *array->View(field.type()); // view gets the field names right for us } case Type::type::DICTIONARY: { @@ -982,8 +996,15 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } case Type::type::EXTENSION: - // Could be supported by generating the storage type (though any extension - // invariants wouldn't be preserved) + if (GetMetadata(field.metadata().get(), "extension_allow_random_storage", + false)) { + const auto& ext_type = checked_cast(*field.type()); + auto storage = ArrayOf(*field.WithType(ext_type.storage_type()), length, + alignment, memory_pool); + return ExtensionType::WrapArray(field.type(), storage); + } + // We don't have explicit permission to generate random storage; bail rather than + // silently risk breaking extension invariants break; case Type::type::FIXED_SIZE_LIST: { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1bd189c39c2df..de9ea6d05648d 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -563,6 +563,13 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// For MapType: /// - values (int32_t): the number of key-value pairs to generate, which will be /// partitioned among the array values. + /// + /// For extension types: + /// - extension_allow_random_storage (bool): in general an extension array may have + /// invariants on its storage beyond those already imposed by the arrow format, + /// which may result in an invalid array if we just wrap randomly generated + /// storage. Set this flag to explicitly allow wrapping of randomly generated + /// storage. 
std::shared_ptr BatchOf( const FieldVector& fields, int64_t size, int64_t alignment = kDefaultBufferAlignment, @@ -575,7 +582,7 @@ std::default_random_engine seed_rng_; }; -/// Generate an array with random data. See RandomArrayGenerator::BatchOf. +/// Generate a batch with random data. See RandomArrayGenerator::BatchOf. ARROW_TESTING_EXPORT std::shared_ptr GenerateBatch( const FieldVector& fields, int64_t size, SeedType seed, diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index b59854480765b..e8a782575e278 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -43,6 +43,7 @@ #include "arrow/table.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/pcg_random.h" @@ -211,4 +212,18 @@ const std::vector>& all_dictionary_index_types() { return types; } +std::vector GetSupportedHardwareFlags( + const std::vector& candidate_flags) { + std::vector hardware_flags; + // Always test fallback codepaths + hardware_flags.push_back(0); + for (const int64_t candidate_flag : candidate_flags) { + if (candidate_flag != 0 && + internal::CpuInfo::GetInstance()->IsSupported(candidate_flag)) { + hardware_flags.push_back(candidate_flag); + } + } + return hardware_flags; +} + } // namespace arrow diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 4f4b03438fd58..b4b2785a36292 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -131,4 +131,10 @@ ARROW_TESTING_EXPORT std::string GetListenAddress(); ARROW_TESTING_EXPORT const std::vector>& all_dictionary_index_types(); +// Get a list of supported hardware flags from the given candidates. +// The result will always contain 0, meaning that no optional CPU features are enabled.
+ARROW_TESTING_EXPORT +std::vector GetSupportedHardwareFlags( + const std::vector& candidate_flags); + } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index d218789f681fc..ccd1ddccf54ce 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -922,7 +922,7 @@ class ARROW_EXPORT BaseListType : public NestedType { ~BaseListType() override; const std::shared_ptr& value_field() const { return children_[0]; } - std::shared_ptr value_type() const { return children_[0]->type(); } + const std::shared_ptr& value_type() const { return children_[0]->type(); } }; /// \brief Concrete type class for list data diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 657abbaecc42b..a8a27139d11bc 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -45,6 +45,7 @@ class Future; namespace util { class Codec; +class CodecOptions; } // namespace util class Buffer; @@ -68,6 +69,7 @@ using FieldVector = std::vector>; class Array; struct ArrayData; +struct ArraySpan; class ArrayBuilder; struct Scalar; diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 2dc99e237079b..5171b5d1d6305 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -61,6 +61,7 @@ add_arrow_test(utility-test reflection_test.cc rows_to_batches_test.cc small_vector_test.cc + span_test.cc stl_util_test.cc string_test.cc tdigest_test.cc diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 55627eb43bbcf..63e27bfbe5773 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -201,10 +201,14 @@ class AsyncTaskSchedulerImpl : public AsyncTaskScheduler { } // Capture `task` to keep it alive until finished if (!submit_result->TryAddCallback([this, task_inner = std::move(task)]() mutable { - return [this, task_inner2 = std::move(task_inner)](const Status& st) { + return [this, task_inner2 = std::move(task_inner)](const Status& st) mutable { #ifdef ARROW_WITH_OPENTELEMETRY TraceTaskFinished(task_inner2.get()); #endif + // OnTaskFinished might trigger the scheduler to end. We want that to be the + // very last thing that happens, after all task destructors have run, so we + // eagerly destroy the task first.
+ task_inner2.reset(); OnTaskFinished(st); }; })) { diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc index 7734b84c9ebaf..313ca91912335 100644 --- a/cpp/src/arrow/util/async_util_test.cc +++ b/cpp/src/arrow/util/async_util_test.cc @@ -204,6 +204,29 @@ TEST(AsyncTaskScheduler, InitialTaskFails) { ASSERT_FINISHES_AND_RAISES(Invalid, finished); } +TEST(AsyncTaskScheduler, TaskDestroyedBeforeSchedulerEnds) { + bool my_task_destroyed = false; + Future<> task_fut = Future<>::Make(); + struct DestroyTrackingTask : public AsyncTaskScheduler::Task { + DestroyTrackingTask(bool& my_task_destroyed, Future<> task_fut) + : my_task_destroyed(my_task_destroyed), task_fut(std::move(task_fut)) {} + ~DestroyTrackingTask() override { my_task_destroyed = true; } + std::string_view name() const override { return "DestroyTrackingTask"; } + Result> operator()() override { return task_fut; } + bool& my_task_destroyed; + Future<> task_fut; + }; + Future<> finished = AsyncTaskScheduler::Make([&](AsyncTaskScheduler* scheduler) { + scheduler->AddTask(std::make_unique( + my_task_destroyed, task_fut)); + return Status::OK(); + }).Then([&] { ASSERT_TRUE(my_task_destroyed); }); + ASSERT_FALSE(my_task_destroyed); + task_fut.MarkFinished(); + ASSERT_FINISHES_OK(finished); + ASSERT_TRUE(my_task_destroyed); +} + TEST(AsyncTaskScheduler, TaskGroup) { Future<> task = Future<>::Make(); bool finish_callback_ran = false; diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index f2fd39d6f37ad..0835ab9074a48 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -969,6 +969,16 @@ bool BasicDecimal256::FitsInPrecision(int32_t precision) const { return BasicDecimal256::Abs(*this) < kDecimal256PowersOfTen[precision]; } +void BasicDecimal256::GetWholeAndFraction(int scale, BasicDecimal256* whole, + BasicDecimal256* fraction) const { + DCHECK_GE(scale, 0); + DCHECK_LE(scale, 76); + + BasicDecimal256 multiplier(kDecimal256PowersOfTen[scale]); + auto s = Divide(multiplier, whole, fraction); + DCHECK_EQ(s, DecimalStatus::kSuccess); +} + const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) { DCHECK_GE(scale, 0); DCHECK_LE(scale, 76); diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h index b263bb234a795..d8a91ea76b390 100644 --- a/cpp/src/arrow/util/basic_decimal.h +++ b/cpp/src/arrow/util/basic_decimal.h @@ -366,6 +366,10 @@ class ARROW_EXPORT BasicDecimal256 : public GenericBasicDecimal, public util::EqualityComparable { public: - template - using View = std::basic_string_view; - Bitmap() = default; Bitmap(const std::shared_ptr& buffer, int64_t offset, int64_t length) @@ -72,17 +69,17 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, Bitmap Slice(int64_t offset) const { if (mutable_data_ != NULLPTR) { - return Bitmap(mutable_data_, offset_ + offset, length_ - offset); + return {mutable_data_, offset_ + offset, length_ - offset}; } else { - return Bitmap(data_, offset_ + offset, length_ - offset); + return {data_, offset_ + offset, length_ - offset}; } } Bitmap Slice(int64_t offset, int64_t length) const { if (mutable_data_ != NULLPTR) { - return Bitmap(mutable_data_, offset_ + offset, length); + return {mutable_data_, offset_ + offset, length}; } else { - return Bitmap(data_, offset_ + offset, length); + return {data_, offset_ + offset, length}; } } @@ -158,7 +155,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, Bitmap bitmaps[N]; int64_t 
offsets[N]; int64_t bit_length = BitLength(bitmaps_arg, N); - View words[N]; + util::span words[N]; for (size_t i = 0; i < N; ++i) { bitmaps[i] = bitmaps_arg[i]; offsets[i] = bitmaps[i].template word_offset(); @@ -386,15 +383,15 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, /// number of bits in this Bitmap int64_t length() const { return length_; } - /// string_view of all bytes which contain any bit in this Bitmap - util::bytes_view bytes() const { + /// span of all bytes which contain any bit in this Bitmap + util::span bytes() const { auto byte_offset = offset_ / 8; auto byte_count = bit_util::CeilDiv(offset_ + length_, 8) - byte_offset; - return util::bytes_view(data_ + byte_offset, byte_count); + return {data_ + byte_offset, static_cast(byte_count)}; } private: - /// string_view of all Words which contain any bit in this Bitmap + /// span of all Words which contain any bit in this Bitmap /// /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36) /// words() would span bits [16, 48). @@ -407,15 +404,15 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, /// \warning The words may contain bytes which lie outside the buffer or are /// uninitialized. template - View words() const { + util::span words() const { auto bytes_addr = reinterpret_cast(bytes().data()); auto words_addr = bytes_addr - bytes_addr % sizeof(Word); auto word_byte_count = bit_util::RoundUpToPowerOf2(static_cast(bytes_addr + bytes().size()), static_cast(sizeof(Word))) - words_addr; - return View(reinterpret_cast(words_addr), - word_byte_count / sizeof(Word)); + return {reinterpret_cast(words_addr), + static_cast(word_byte_count / sizeof(Word))}; } /// offset of first bit relative to words().data() diff --git a/cpp/src/arrow/util/byte_size.cc b/cpp/src/arrow/util/byte_size.cc index fe232c9accd77..548368757c2e2 100644 --- a/cpp/src/arrow/util/byte_size.cc +++ b/cpp/src/arrow/util/byte_size.cc @@ -27,6 +27,7 @@ #include "arrow/record_batch.h" #include "arrow/table.h" #include "arrow/util/logging.h" +#include "arrow/util/ree_util.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -294,6 +295,20 @@ struct GetByteRangesArray { return Status::OK(); } + Status Visit(const RunEndEncodedType& type) const { + auto [phys_offset, phys_length] = ree_util::FindPhysicalRange(input, offset, length); + for (int i = 0; i < type.num_fields(); i++) { + GetByteRangesArray child{*input.child_data[i], + /*offset=*/input.child_data[i]->offset + phys_offset, + /*length=*/phys_length, + range_starts, + range_offsets, + range_lengths}; + RETURN_NOT_OK(VisitTypeInline(*type.field(i)->type(), &child)); + } + return Status::OK(); + } + Status Visit(const ExtensionType& extension_type) const { GetByteRangesArray storage{input, offset, length, range_starts, range_offsets, range_lengths}; diff --git a/cpp/src/arrow/util/byte_size_test.cc b/cpp/src/arrow/util/byte_size_test.cc index fc18049fddf0a..0aaf0a76a2a42 100644 --- a/cpp/src/arrow/util/byte_size_test.cc +++ b/cpp/src/arrow/util/byte_size_test.cc @@ -390,6 +390,33 @@ TEST(ByteRanges, SparseUnion) { }); } +TEST(ByteRanges, RunEndEncodedArray) { + auto run_ends = + ArrayFromJSON(int32(), "[-1, -1, 100, 200, 300, 400, 500, -1]")->Slice(2, 5); + auto values = ArrayFromJSON(int32(), R"([-1, 1, 2, 3, 4, 5, -1, -1])")->Slice(1, 5); + ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(500, run_ends, values)); + CheckBufferRanges(ree_array, { + {0, 8, 20}, + {1, 4, 20}, + }); + CheckBufferRanges(ree_array->Slice(50), { + {0, 8, 20}, + 
{1, 4, 20}, + }); + CheckBufferRanges(ree_array->Slice(100, 400), { + {0, 12, 16}, + {1, 8, 16}, + }); + CheckBufferRanges(ree_array->Slice(100, 301), { + {0, 12, 16}, + {1, 8, 16}, + }); + CheckBufferRanges(ree_array->Slice(100, 300), { + {0, 12, 12}, + {1, 8, 12}, + }); +} + TEST(ByteRanges, ExtensionArray) { std::shared_ptr ext_arr = ExampleUuid(); CheckBufferRanges(ext_arr, {{0, 0, 1}, {1, 0, 64}}); diff --git a/cpp/src/arrow/util/byte_stream_split.h b/cpp/src/arrow/util/byte_stream_split.h index 28dcce52bb8fc..d428df0659b28 100644 --- a/cpp/src/arrow/util/byte_stream_split.h +++ b/cpp/src/arrow/util/byte_stream_split.h @@ -39,9 +39,9 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams; const int64_t size = num_values * sizeof(T); - constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams; const int64_t num_blocks = size / kBlockSize; uint8_t* output_data = reinterpret_cast(out); @@ -92,11 +92,12 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_value uint8_t* output_buffer_raw) { constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); + constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams; + __m128i stage[3][kNumStreams]; __m128i final_result[kNumStreams]; const size_t size = num_values * sizeof(T); - constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams; const size_t num_blocks = size / kBlockSize; const __m128i* raw_values_sse = reinterpret_cast(raw_values); __m128i* output_buffer_streams[kNumStreams]; @@ -143,7 +144,7 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_value _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // This is the path for double. __m128i tmp[8]; for (size_t i = 0; i < 4; ++i) { @@ -181,9 +182,9 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams; const int64_t size = num_values * sizeof(T); - constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams; if (size < kBlockSize) // Back to SSE for small size return ByteStreamSplitDecodeSse2(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; @@ -220,7 +221,7 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // path for double, 128i index: // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B}, // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F}, @@ -266,11 +267,12 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value uint8_t* output_buffer_raw) { constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - if (kNumStreams == 8U) // Back to SSE, currently no path for double. 
+ constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams; + + if constexpr (kNumStreams == 8U) // Back to SSE, currently no path for double. return ByteStreamSplitEncodeSse2(raw_values, num_values, output_buffer_raw); const size_t size = num_values * sizeof(T); - constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams; if (size < kBlockSize) // Back to SSE for small size return ByteStreamSplitEncodeSse2(raw_values, num_values, output_buffer_raw); const size_t num_blocks = size / kBlockSize; @@ -339,9 +341,9 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_ constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; const int64_t size = num_values * sizeof(T); - constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; if (size < kBlockSize) // Back to AVX2 for small size return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; @@ -379,7 +381,7 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_ } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // path for double, 128i index: // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, @@ -442,8 +444,10 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val uint8_t* output_buffer_raw) { constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - const size_t size = num_values * sizeof(T); constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams; + + const size_t size = num_values * sizeof(T); + if (size < kBlockSize) // Back to AVX2 for small size return ByteStreamSplitEncodeAvx2(raw_values, num_values, output_buffer_raw); @@ -469,7 +473,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val __m512i unpack[KNumUnpack + 1][kNumStreams]; __m512i permutex[kNumStreams]; __m512i permutex_mask; - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version. permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006, 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004, @@ -494,7 +498,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // path for double // 1. unpack to epi16 block // 2. 
permutexvar_epi16 to 128i block diff --git a/cpp/src/arrow/util/compression.cc b/cpp/src/arrow/util/compression.cc index c67cb4539bc8f..5ad17e993f153 100644 --- a/cpp/src/arrow/util/compression.cc +++ b/cpp/src/arrow/util/compression.cc @@ -136,7 +136,7 @@ Result Codec::DefaultCompressionLevel(Compression::type codec_type) { } Result> Codec::Create(Compression::type codec_type, - int compression_level) { + const CodecOptions& codec_options) { if (!IsAvailable(codec_type)) { if (codec_type == Compression::LZO) { return Status::NotImplemented("LZO codec not implemented"); @@ -151,6 +151,7 @@ Result> Codec::Create(Compression::type codec_type, "' not built"); } + auto compression_level = codec_options.compression_level; if (compression_level != kUseDefaultCompressionLevel && !SupportsCompressionLevel(codec_type)) { return Status::Invalid("Codec '", GetCodecAsString(codec_type), @@ -166,16 +167,23 @@ Result> Codec::Create(Compression::type codec_type, codec = internal::MakeSnappyCodec(); #endif break; - case Compression::GZIP: + case Compression::GZIP: { #ifdef ARROW_WITH_ZLIB - codec = internal::MakeGZipCodec(compression_level); + auto opt = dynamic_cast(&codec_options); + codec = internal::MakeGZipCodec(compression_level, + opt ? opt->gzip_format : GZipFormat::GZIP, + opt ? opt->window_bits : std::nullopt); #endif break; - case Compression::BROTLI: + } + case Compression::BROTLI: { #ifdef ARROW_WITH_BROTLI - codec = internal::MakeBrotliCodec(compression_level); + auto opt = dynamic_cast(&codec_options); + codec = internal::MakeBrotliCodec(compression_level, + opt ? opt->window_bits : std::nullopt); #endif break; + } case Compression::LZ4: #ifdef ARROW_WITH_LZ4 codec = internal::MakeLz4RawCodec(compression_level); @@ -210,6 +218,12 @@ Result> Codec::Create(Compression::type codec_type, return std::move(codec); } +// use compression level to create Codec +Result> Codec::Create(Compression::type codec_type, + int compression_level) { + return Codec::Create(codec_type, CodecOptions{compression_level}); +} + bool Codec::IsAvailable(Compression::type codec_type) { switch (codec_type) { case Compression::UNCOMPRESSED: diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h index f0d359d195c80..f7bf4d5e12d02 100644 --- a/cpp/src/arrow/util/compression.h +++ b/cpp/src/arrow/util/compression.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "arrow/result.h" @@ -107,6 +108,40 @@ class ARROW_EXPORT Decompressor { // XXX add methods for buffer size heuristics? 
}; +/// \brief Compression codec options +class ARROW_EXPORT CodecOptions { + public: + explicit CodecOptions(int compression_level = kUseDefaultCompressionLevel) + : compression_level(compression_level) {} + + virtual ~CodecOptions() = default; + + int compression_level; +}; + +// ---------------------------------------------------------------------- +// GZip codec options implementation + +enum class GZipFormat { + ZLIB, + DEFLATE, + GZIP, +}; + +class ARROW_EXPORT GZipCodecOptions : public CodecOptions { + public: + GZipFormat gzip_format = GZipFormat::GZIP; + std::optional window_bits; +}; + +// ---------------------------------------------------------------------- +// brotli codec options implementation + +class ARROW_EXPORT BrotliCodecOptions : public CodecOptions { + public: + std::optional window_bits; +}; + /// \brief Compression codec class ARROW_EXPORT Codec { public: @@ -122,9 +157,13 @@ class ARROW_EXPORT Codec { /// \brief Return compression type for name (all lower case) static Result GetCompressionType(const std::string& name); - /// \brief Create a codec for the given compression algorithm + /// \brief Create a codec for the given compression algorithm with CodecOptions static Result> Create( - Compression::type codec, int compression_level = kUseDefaultCompressionLevel); + Compression::type codec, const CodecOptions& codec_options = CodecOptions{}); + + /// \brief Create a codec for the given compression algorithm + static Result> Create(Compression::type codec, + int compression_level); /// \brief Return true if support for indicated codec has been enabled static bool IsAvailable(Compression::type codec); diff --git a/cpp/src/arrow/util/compression_brotli.cc b/cpp/src/arrow/util/compression_brotli.cc index 0ee69281c9fa0..5025f595022f1 100644 --- a/cpp/src/arrow/util/compression_brotli.cc +++ b/cpp/src/arrow/util/compression_brotli.cc @@ -92,8 +92,8 @@ class BrotliDecompressor : public Decompressor { class BrotliCompressor : public Compressor { public: - explicit BrotliCompressor(int compression_level) - : compression_level_(compression_level) {} + explicit BrotliCompressor(int compression_level, int window_bits) + : compression_level_(compression_level), window_bits_(window_bits) {} ~BrotliCompressor() override { if (state_ != nullptr) { @@ -109,6 +109,9 @@ class BrotliCompressor : public Compressor { if (!BrotliEncoderSetParameter(state_, BROTLI_PARAM_QUALITY, compression_level_)) { return BrotliError("Brotli set compression level failed"); } + if (!BrotliEncoderSetParameter(state_, BROTLI_PARAM_LGWIN, window_bits_)) { + return BrotliError("Brotli set window size failed"); + } return Status::OK(); } @@ -166,6 +169,7 @@ class BrotliCompressor : public Compressor { private: const int compression_level_; + const int window_bits_; }; // ---------------------------------------------------------------------- @@ -173,10 +177,11 @@ class BrotliCompressor : public Compressor { class BrotliCodec : public Codec { public: - explicit BrotliCodec(int compression_level) + explicit BrotliCodec(int compression_level, int window_bits) : compression_level_(compression_level == kUseDefaultCompressionLevel ? 
kBrotliDefaultCompressionLevel - : compression_level) {} + : compression_level), + window_bits_(window_bits) {} Result Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, uint8_t* output_buffer) override { @@ -201,16 +206,16 @@ class BrotliCodec : public Codec { DCHECK_GE(input_len, 0); DCHECK_GE(output_buffer_len, 0); std::size_t output_size = static_cast(output_buffer_len); - if (BrotliEncoderCompress(compression_level_, BROTLI_DEFAULT_WINDOW, - BROTLI_DEFAULT_MODE, static_cast(input_len), input, - &output_size, output_buffer) == BROTLI_FALSE) { + if (BrotliEncoderCompress(compression_level_, window_bits_, BROTLI_DEFAULT_MODE, + static_cast(input_len), input, &output_size, + output_buffer) == BROTLI_FALSE) { return Status::IOError("Brotli compression failure."); } return output_size; } Result> MakeCompressor() override { - auto ptr = std::make_shared(compression_level_); + auto ptr = std::make_shared(compression_level_, window_bits_); RETURN_NOT_OK(ptr->Init()); return ptr; } @@ -221,6 +226,14 @@ class BrotliCodec : public Codec { return ptr; } + Status Init() override { + if (window_bits_ < BROTLI_MIN_WINDOW_BITS || window_bits_ > BROTLI_MAX_WINDOW_BITS) { + return Status::Invalid("Brotli window_bits should be between ", + BROTLI_MIN_WINDOW_BITS, " and ", BROTLI_MAX_WINDOW_BITS); + } + return Status::OK(); + } + Compression::type compression_type() const override { return Compression::BROTLI; } int compression_level() const override { return compression_level_; } @@ -232,12 +245,15 @@ class BrotliCodec : public Codec { private: const int compression_level_; + const int window_bits_; }; } // namespace -std::unique_ptr MakeBrotliCodec(int compression_level) { - return std::make_unique(compression_level); +std::unique_ptr MakeBrotliCodec(int compression_level, + std::optional window_bits) { + return std::make_unique(compression_level, + window_bits.value_or(BROTLI_DEFAULT_WINDOW)); } } // namespace internal diff --git a/cpp/src/arrow/util/compression_internal.h b/cpp/src/arrow/util/compression_internal.h index d4cdca117da0c..ab2cf6d98b632 100644 --- a/cpp/src/arrow/util/compression_internal.h +++ b/cpp/src/arrow/util/compression_internal.h @@ -35,25 +35,20 @@ constexpr int kBrotliDefaultCompressionLevel = 8; // Brotli codec. std::unique_ptr MakeBrotliCodec( - int compression_level = kBrotliDefaultCompressionLevel); + int compression_level = kBrotliDefaultCompressionLevel, + std::optional window_bits = std::nullopt); // BZ2 codec. 
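The CodecOptions hierarchy introduced in compression.h gives Codec::Create a single entry point for codec-specific parameters. A usage sketch; the level and window size are illustrative values:

#include "arrow/util/compression.h"

arrow::Result<std::unique_ptr<arrow::util::Codec>> MakeCustomGZipCodec() {
  arrow::util::GZipCodecOptions options;
  options.compression_level = 6;                        // any level the codec accepts
  options.gzip_format = arrow::util::GZipFormat::ZLIB;  // emit zlib framing
  options.window_bits = 12;                             // GZipCodec::Init checks 9..15
  return arrow::util::Codec::Create(arrow::Compression::GZIP, options);
}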
constexpr int kBZ2DefaultCompressionLevel = 9; + std::unique_ptr MakeBZ2Codec(int compression_level = kBZ2DefaultCompressionLevel); // GZip constexpr int kGZipDefaultCompressionLevel = 9; -struct GZipFormat { - enum type { - ZLIB, - DEFLATE, - GZIP, - }; -}; - std::unique_ptr MakeGZipCodec(int compression_level = kGZipDefaultCompressionLevel, - GZipFormat::type format = GZipFormat::GZIP); + GZipFormat format = GZipFormat::GZIP, + std::optional window_bits = std::nullopt); // Snappy std::unique_ptr MakeSnappyCodec(); diff --git a/cpp/src/arrow/util/compression_test.cc b/cpp/src/arrow/util/compression_test.cc index 761e883ec7e83..8f2a7f052ccb6 100644 --- a/cpp/src/arrow/util/compression_test.cc +++ b/cpp/src/arrow/util/compression_test.cc @@ -389,9 +389,81 @@ TEST(TestCodecMisc, SpecifyCompressionLevel) { continue; } const auto level = combination.level; + const auto codec_options = arrow::util::CodecOptions(level); const auto expect_success = combination.expect_success; - auto result1 = Codec::Create(compression, level); - auto result2 = Codec::Create(compression, level); + auto result1 = Codec::Create(compression, codec_options); + auto result2 = Codec::Create(compression, codec_options); ASSERT_EQ(expect_success, result1.ok()); ASSERT_EQ(expect_success, result2.ok()); if (expect_success) { CheckCodecRoundtrip(*result1, *result2, data); } } } +TEST(TestCodecMisc, SpecifyCodecOptionsGZip) { + // For now only the GZip and Brotli codecs have dedicated option classes, since they + // expose extra codec-specific parameters; other codecs can use CodecOptions directly, + // and more specific option classes can be added as needed. + struct CombinationOption { + int level; + GZipFormat format; + int window_bits; + bool expect_success; + }; + constexpr CombinationOption combinations[] = {{2, GZipFormat::ZLIB, 12, true}, + {9, GZipFormat::GZIP, 9, true}, + {9, GZipFormat::GZIP, 20, false}, + {5, GZipFormat::DEFLATE, -12, false}, + {-992, GZipFormat::GZIP, 15, false}}; + + std::vector data = MakeRandomData(2000); + for (const auto& combination : combinations) { + const auto compression = Compression::GZIP; + if (!Codec::IsAvailable(compression)) { + // Support for this codec hasn't been built + continue; + } + auto codec_options = arrow::util::GZipCodecOptions(); + codec_options.compression_level = combination.level; + codec_options.gzip_format = combination.format; + codec_options.window_bits = combination.window_bits; + const auto expect_success = combination.expect_success; + auto result1 = Codec::Create(compression, codec_options); + auto result2 = Codec::Create(compression, codec_options); + ASSERT_EQ(expect_success, result1.ok()); + ASSERT_EQ(expect_success, result2.ok()); + if (expect_success) { + CheckCodecRoundtrip(*result1, *result2, data); + } + } +} + +TEST(TestCodecMisc, SpecifyCodecOptionsBrotli) { + // For now only the GZip and Brotli codecs have dedicated option classes, since they + // expose extra codec-specific parameters; other codecs can use CodecOptions directly, + // and more specific option classes can be added as needed.
+ struct CombinationOption { + int level; + int window_bits; + bool expect_success; + }; + constexpr CombinationOption combinations[] = { + {8, 22, true}, {11, 10, true}, {1, 24, true}, {5, -12, false}, {-992, 25, false}}; + + std::vector data = MakeRandomData(2000); + for (const auto& combination : combinations) { + const auto compression = Compression::BROTLI; + if (!Codec::IsAvailable(compression)) { + // Support for this codec hasn't been built + continue; + } + auto codec_options = arrow::util::BrotliCodecOptions(); + codec_options.compression_level = combination.level; + codec_options.window_bits = combination.window_bits; + const auto expect_success = combination.expect_success; + auto result1 = Codec::Create(compression, codec_options); + auto result2 = Codec::Create(compression, codec_options); ASSERT_EQ(expect_success, result1.ok()); ASSERT_EQ(expect_success, result2.ok()); if (expect_success) { diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 6dcc5153abd4e..2b38bdceab15b 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -44,7 +44,13 @@ namespace { // there. // Maximum window size -constexpr int WINDOW_BITS = 15; +constexpr int kGZipMaxWindowBits = 15; + +// Minimum window size +constexpr int kGZipMinWindowBits = 9; + +// Default window size +constexpr int kGZipDefaultWindowBits = 15; // Output Gzip. constexpr int GZIP_CODEC = 16; @@ -55,8 +61,7 @@ constexpr int DETECT_CODEC = 32; constexpr int kGZipMinCompressionLevel = 1; constexpr int kGZipMaxCompressionLevel = 9; -int CompressionWindowBitsForFormat(GZipFormat::type format) { - int window_bits = WINDOW_BITS; +int CompressionWindowBitsForFormat(GZipFormat format, int window_bits) { switch (format) { case GZipFormat::DEFLATE: window_bits = -window_bits; @@ -70,12 +75,12 @@ int CompressionWindowBitsForFormat(GZipFormat::type format) { return window_bits; } -int DecompressionWindowBitsForFormat(GZipFormat::type format) { +int DecompressionWindowBitsForFormat(GZipFormat format, int window_bits) { if (format == GZipFormat::DEFLATE) { - return -WINDOW_BITS; + return -window_bits; } else { /* If not deflate, autodetect format from header */ - return WINDOW_BITS | DETECT_CODEC; + return window_bits | DETECT_CODEC; } } @@ -88,8 +93,11 @@ Status ZlibErrorPrefix(const char* prefix_msg, const char* msg) { class GZipDecompressor : public Decompressor { public: - explicit GZipDecompressor(GZipFormat::type format) - : format_(format), initialized_(false), finished_(false) {} + explicit GZipDecompressor(GZipFormat format, int window_bits) + : format_(format), + window_bits_(window_bits), + initialized_(false), + finished_(false) {} ~GZipDecompressor() override { if (initialized_) { @@ -103,7 +111,7 @@ class GZipDecompressor : public Decompressor { finished_ = false; int ret; - int window_bits = DecompressionWindowBitsForFormat(format_); + int window_bits = DecompressionWindowBitsForFormat(format_, window_bits_); if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { return ZlibError("zlib inflateInit failed: "); } else { @@ -161,7 +169,8 @@ class GZipDecompressor : public Decompressor { } z_stream stream_; - GZipFormat::type format_; + GZipFormat format_; + int window_bits_; bool initialized_; bool finished_; }; @@ -180,13 +189,13 @@ class GZipCompressor : public Compressor { } } - Status Init(GZipFormat::type format) { + Status Init(GZipFormat format, int input_window_bits) { DCHECK(!initialized_); memset(&stream_, 0, sizeof(stream_)); int 
ret; // Initialize to run specified format - int window_bits = CompressionWindowBitsForFormat(format); + int window_bits = CompressionWindowBitsForFormat(format, input_window_bits); if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, compression_level_, Z_DEFAULT_STRATEGY)) != Z_OK) { return ZlibError("zlib deflateInit failed: "); @@ -300,8 +309,9 @@ class GZipCompressor : public Compressor { class GZipCodec : public Codec { public: - explicit GZipCodec(int compression_level, GZipFormat::type format) + explicit GZipCodec(int compression_level, GZipFormat format, int window_bits) : format_(format), + window_bits_(window_bits), compressor_initialized_(false), decompressor_initialized_(false) { compression_level_ = compression_level == kUseDefaultCompressionLevel @@ -316,12 +326,12 @@ class GZipCodec : public Codec { Result> MakeCompressor() override { auto ptr = std::make_shared(compression_level_); - RETURN_NOT_OK(ptr->Init(format_)); + RETURN_NOT_OK(ptr->Init(format_, window_bits_)); return ptr; } Result> MakeDecompressor() override { - auto ptr = std::make_shared(format_); + auto ptr = std::make_shared(format_, window_bits_); RETURN_NOT_OK(ptr->Init()); return ptr; } @@ -332,7 +342,7 @@ class GZipCodec : public Codec { int ret; // Initialize to run specified format - int window_bits = CompressionWindowBitsForFormat(format_); + int window_bits = CompressionWindowBitsForFormat(format_, window_bits_); if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, compression_level_, Z_DEFAULT_STRATEGY)) != Z_OK) { return ZlibErrorPrefix("zlib deflateInit failed: ", stream_.msg); @@ -354,7 +364,7 @@ class GZipCodec : public Codec { int ret; // Initialize to run either deflate or zlib/gzip format - int window_bits = DecompressionWindowBitsForFormat(format_); + int window_bits = DecompressionWindowBitsForFormat(format_, window_bits_); if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { return ZlibErrorPrefix("zlib inflateInit failed: ", stream_.msg); } @@ -461,6 +471,10 @@ class GZipCodec : public Codec { } Status Init() override { + if (window_bits_ < kGZipMinWindowBits || window_bits_ > kGZipMaxWindowBits) { + return Status::Invalid("GZip window_bits should be between ", kGZipMinWindowBits, + " and ", kGZipMaxWindowBits); + } const Status init_compressor_status = InitCompressor(); if (!init_compressor_status.ok()) { return init_compressor_status; @@ -482,7 +496,7 @@ class GZipCodec : public Codec { // Realistically, this will always be GZIP, but we leave the option open to // configure - GZipFormat::type format_; + GZipFormat format_; // These variables are mutually exclusive. When the codec is in "compressor" // state, compressor_initialized_ is true while decompressor_initialized_ is @@ -491,6 +505,7 @@ class GZipCodec : public Codec { // Indeed, this is slightly hacky, but the alternative is having separate // Compressor and Decompressor classes. 
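The plumbing above makes the zlib window size configurable end to end: GZipCodec stores window_bits_, validates it in Init() against [kGZipMinWindowBits, kGZipMaxWindowBits], and threads it into deflateInit2/inflateInit2. A minimal caller-side sketch; GZipCodecOptions and its members are an assumption modeled on the BrotliCodecOptions exercised in the test above, since this diff only shows the MakeGZipCodec signature:

```cpp
#include <memory>

#include "arrow/result.h"
#include "arrow/util/compression.h"

// Sketch only: GZipCodecOptions and its window_bits/compression_level members
// are assumed to mirror the BrotliCodecOptions used in the compression test.
arrow::Result<std::unique_ptr<arrow::util::Codec>> MakeSmallWindowGzip() {
  arrow::util::GZipCodecOptions options;
  options.compression_level = 6;
  options.window_bits = 9;  // smallest zlib window (2^9 = 512 bytes); values
                            // outside [9, 15] make GZipCodec::Init() fail
  return arrow::util::Codec::Create(arrow::Compression::GZIP, options);
}
```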
If this ever becomes an issue, we can // perform the refactoring then + int window_bits_; bool compressor_initialized_; bool decompressor_initialized_; int compression_level_; @@ -498,8 +513,10 @@ } // namespace -std::unique_ptr<Codec> MakeGZipCodec(int compression_level, GZipFormat::type format) { - return std::make_unique<GZipCodec>(compression_level, format); +std::unique_ptr<Codec> MakeGZipCodec(int compression_level, GZipFormat format, + std::optional<int> window_bits) { + return std::make_unique<GZipCodec>(compression_level, format, + window_bits.value_or(kGZipDefaultWindowBits)); } } // namespace internal diff --git a/cpp/src/arrow/util/config.h.cmake b/cpp/src/arrow/util/config.h.cmake index f6fad2016a27e..1008b9c6b9a05 100644 --- a/cpp/src/arrow/util/config.h.cmake +++ b/cpp/src/arrow/util/config.h.cmake @@ -57,5 +57,3 @@ #cmakedefine ARROW_WITH_MUSL #cmakedefine ARROW_WITH_OPENTELEMETRY #cmakedefine ARROW_WITH_UCX - -#cmakedefine GRPCPP_PP_INCLUDE diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 1f8447059f633..704b6bb9d491d 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -305,12 +305,39 @@ struct Decimal128RealConversion } template <typename Real> - static Real ToRealPositive(const Decimal128& decimal, int32_t scale) { + static Real ToRealPositiveNoSplit(const Decimal128& decimal, int32_t scale) { Real x = RealTraits<Real>::two_to_64(static_cast<Real>(decimal.high_bits())); x += static_cast<Real>(decimal.low_bits()); x *= LargePowerOfTen<Real>(-scale); return x; } + + /// An approximate conversion from Decimal128 to Real that guarantees: + /// 1. If the decimal is an integer, the conversion is exact. + /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. + /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact + /// value. + /// 3. Otherwise, the conversion is within 2^(-RealTraits::kMantissaDigits+1) + /// (e.g. 2^-23 for float and 2^-52 for double) of the exact value. + /// Here "exact value" means the closest value representable by Real. + template <typename Real> + static Real ToRealPositive(const Decimal128& decimal, int32_t scale) { + if (scale <= 0 || (decimal.high_bits() == 0 && + decimal.low_bits() <= RealTraits<Real>::kMaxPreciseInteger)) { + // No need to split the decimal if it is already an integer (scale <= 0) or if it + // can be precisely represented by Real + return ToRealPositiveNoSplit<Real>(decimal, scale); + } + + // Split decimal into whole and fractional parts to avoid precision loss + BasicDecimal128 whole_decimal, fraction_decimal; + decimal.GetWholeAndFraction(scale, &whole_decimal, &fraction_decimal); + + Real whole = ToRealPositiveNoSplit<Real>(whole_decimal, 0); + Real fraction = ToRealPositiveNoSplit<Real>(fraction_decimal, scale); + + return whole + fraction; + } }; } // namespace @@ -967,7 +994,7 @@ struct Decimal256RealConversion } template <typename Real> - static Real ToRealPositive(const Decimal256& decimal, int32_t scale) { + static Real ToRealPositiveNoSplit(const Decimal256& decimal, int32_t scale) { DCHECK_GE(decimal, 0); Real x = 0; const auto parts_le = bit_util::little_endian::Make(decimal.native_endian_array()); @@ -978,6 +1005,33 @@ x *= LargePowerOfTen<Real>(-scale); return x; } + + /// An approximate conversion from Decimal256 to Real that guarantees: + /// 1. If the decimal is an integer, the conversion is exact. + /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. + /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact + /// value.
+ /// 3. Otherwise, the conversion is within 2^(-RealTraits::kMantissaDigits+1) + /// (e.g. 2^-23 for float and 2^-52 for double) of the exact value. + /// Here "exact value" means the closest value representable by Real. + template <typename Real> + static Real ToRealPositive(const Decimal256& decimal, int32_t scale) { + const auto parts_le = decimal.little_endian_array(); + if (scale <= 0 || (parts_le[3] == 0 && parts_le[2] == 0 && parts_le[1] == 0 && + parts_le[0] < RealTraits<Real>::kMaxPreciseInteger)) { + // No need to split the decimal if it is already an integer (scale <= 0) or if it + // can be precisely represented by Real + return ToRealPositiveNoSplit<Real>(decimal, scale); + } + + // Split the decimal into whole and fractional parts to avoid precision loss + BasicDecimal256 whole_decimal, fraction_decimal; + decimal.GetWholeAndFraction(scale, &whole_decimal, &fraction_decimal); + + Real whole = ToRealPositiveNoSplit<Real>(whole_decimal, 0); + Real fraction = ToRealPositiveNoSplit<Real>(fraction_decimal, scale); + return whole + fraction; + } }; } // namespace diff --git a/cpp/src/arrow/util/decimal_internal.h b/cpp/src/arrow/util/decimal_internal.h index 041aac4ef860d..51a7229ab6678 100644 --- a/cpp/src/arrow/util/decimal_internal.h +++ b/cpp/src/arrow/util/decimal_internal.h @@ -451,6 +451,8 @@ struct RealTraits { static constexpr int kMantissaBits = 24; // ceil(log10(2 ^ kMantissaBits)) static constexpr int kMantissaDigits = 8; + // Integers between zero and kMaxPreciseInteger can be precisely represented + static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; }; template <> @@ -464,6 +466,8 @@ struct RealTraits { static constexpr int kMantissaBits = 53; // ceil(log10(2 ^ kMantissaBits)) static constexpr int kMantissaDigits = 16; + // Integers between zero and kMaxPreciseInteger can be precisely represented + static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; }; template diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc index 1401750ce76d6..0a8b7a09730bf 100644 --- a/cpp/src/arrow/util/decimal_test.cc +++ b/cpp/src/arrow/util/decimal_test.cc @@ -1046,24 +1046,51 @@ using ToDoubleTestParam = ToRealTestParam<double>; template <typename Decimal, typename Real> void CheckDecimalToReal(const std::string& decimal_value, int32_t scale, Real expected) { Decimal dec(decimal_value); - ASSERT_EQ(dec.template ToReal<Real>(scale), expected) - << "Decimal value: " << decimal_value << " Scale: " << scale; + Real actual = dec.template ToReal<Real>(scale); + ASSERT_EQ(actual, expected) << "Decimal value: " << decimal_value + << ", scale: " << scale << ", expected: " << expected + << ", actual: " << actual; +} + +template <typename Decimal, typename Real> +void CheckDecimalToRealWithinOneULP(const std::string& decimal_value, int32_t scale, + Real expected) { + Decimal dec(decimal_value); + Real actual = dec.template ToReal<Real>(scale); + ASSERT_TRUE(actual == expected || actual == std::nextafter(expected, expected + 1) || + actual == std::nextafter(expected, expected - 1)) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; +} + +template <typename Decimal, typename Real> +void CheckDecimalToRealWithinEpsilon(const std::string& decimal_value, int32_t scale, + Real epsilon, Real expected) { + Decimal dec(decimal_value); + Real actual = dec.template ToReal<Real>(scale); + ASSERT_LE(std::abs(actual - expected), epsilon) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } template <typename Decimal> void CheckDecimalToRealApprox(const std::string& decimal_value, 
int32_t scale, float expected) { Decimal dec(decimal_value); - ASSERT_FLOAT_EQ(dec.template ToReal(scale), expected) - << "Decimal value: " << decimal_value << " Scale: " << scale; + float actual = dec.template ToReal(scale); + ASSERT_FLOAT_EQ(actual, expected) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } template void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, double expected) { Decimal dec(decimal_value); - ASSERT_DOUBLE_EQ(dec.template ToReal(scale), expected) - << "Decimal value: " << decimal_value << " Scale: " << scale; + double actual = dec.template ToReal(scale); + ASSERT_DOUBLE_EQ(actual, expected) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } // Common tests for Decimal128::ToReal and Decimal256::ToReal @@ -1110,59 +1137,79 @@ class TestDecimalToReal : public ::testing::Test { } } } +}; - // Test precision of conversions to float values - void TestPrecision() { - // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision) - CheckDecimalToReal("9223373136366403584", 0, 9.223373e+18f); - CheckDecimalToReal("-9223373136366403584", 0, -9.223373e+18f); - // 2**64 + 2**41 (exactly representable in a float) - CheckDecimalToReal("18446746272732807168", 0, 1.8446746e+19f); - CheckDecimalToReal("-18446746272732807168", 0, -1.8446746e+19f); - } +TYPED_TEST_SUITE(TestDecimalToReal, RealTypes); +TYPED_TEST(TestDecimalToReal, TestSuccess) { this->TestSuccess(); } - // Test conversions with a range of scales - void TestLargeValues(int32_t max_scale) { - // Note that exact comparisons would succeed on some platforms (Linux, macOS). - // Nevertheless, power-of-ten factors are not all exactly representable - // in binary floating point. - for (int32_t scale = -max_scale; scale <= max_scale; scale++) { +// Custom test for Decimal::ToReal +template +class TestDecimalToRealFloat : public TestDecimalToReal> {}; +TYPED_TEST_SUITE(TestDecimalToRealFloat, DecimalTypes); + +TYPED_TEST(TestDecimalToRealFloat, LargeValues) { + auto max_scale = TypeParam::kMaxScale; + // Note that exact comparisons would succeed on some platforms (Linux, macOS). + // Nevertheless, power-of-ten factors are not all exactly representable + // in binary floating point. 
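The refactoring above converts the whole and fractional digits separately instead of scaling the full 128/256-bit significand by a rounded power of ten in one step, which is what makes integer-valued decimals convert exactly. A standalone illustration of the idea (GCC/Clang only, since it uses unsigned __int128; repeated division by 10 stands in for Arrow's LargePowerOfTen multiply, and no Arrow helper is used):

```cpp
#include <cassert>
#include <cstdio>

using uint128 = unsigned __int128;

uint128 Pow10(int n) {
  uint128 p = 1;
  while (n-- > 0) p *= 10;
  return p;
}

// Naive: convert the scaled integer in one go, then unscale through a chain
// of rounded float operations; every step can add error.
float NaiveToFloat(uint128 scaled, int scale) {
  float x = static_cast<float>(scaled);
  for (int i = 0; i < scale; ++i) x /= 10.0f;
  return x;
}

// Split: the whole digits take the integer->float path (exact whenever they
// fit the mantissa); only the fractional digits go through the lossy
// unscaling. Mirrors the GetWholeAndFraction split above.
float SplitToFloat(uint128 scaled, int scale) {
  const uint128 pow = Pow10(scale);
  const float whole = static_cast<float>(scaled / pow);
  float fraction = static_cast<float>(scaled % pow);
  for (int i = 0; i < scale; ++i) fraction /= 10.0f;
  return whole + fraction;
}

int main() {
  // The decimal 7.000...0 with 36 fractional zeros: integer 7 * 10^36, scale 36.
  const uint128 scaled = 7 * Pow10(36);
  assert(SplitToFloat(scaled, 36) == 7.0f);  // whole = 7 exact, fraction = 0
  std::printf("naive = %.9g, split = %.9g\n",
              static_cast<double>(NaiveToFloat(scaled, 36)),
              static_cast<double>(SplitToFloat(scaled, 36)));
  return 0;
}
```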
+ for (int32_t scale = -max_scale; scale <= max_scale; scale++) { #ifdef _WIN32 - // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero - if (scale == 45) continue; + // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero + if (scale == 45) continue; #endif - CheckDecimalToRealApprox("1", scale, Pow10(-scale)); - } - for (int32_t scale = -max_scale; scale <= max_scale - 2; scale++) { + CheckDecimalToRealApprox("1", scale, this->Pow10(-scale)); + } + for (int32_t scale = -max_scale; scale <= max_scale - 2; scale++) { #ifdef _WIN32 - // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero - if (scale == 45) continue; + // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero + if (scale == 45) continue; #endif - const Real factor = static_cast(123); - CheckDecimalToRealApprox("123", scale, factor * Pow10(-scale)); - } + const auto factor = static_cast(123); + CheckDecimalToRealApprox("123", scale, factor * this->Pow10(-scale)); } -}; - -TYPED_TEST_SUITE(TestDecimalToReal, RealTypes); - -TYPED_TEST(TestDecimalToReal, TestSuccess) { this->TestSuccess(); } +} -// Custom test for Decimal128::ToReal -class TestDecimal128ToRealFloat : public TestDecimalToReal> { -}; -TEST_F(TestDecimal128ToRealFloat, LargeValues) { TestLargeValues(/*max_scale=*/38); } -TEST_F(TestDecimal128ToRealFloat, Precision) { this->TestPrecision(); } -// Custom test for Decimal256::ToReal -class TestDecimal256ToRealFloat : public TestDecimalToReal> { -}; -TEST_F(TestDecimal256ToRealFloat, LargeValues) { TestLargeValues(/*max_scale=*/76); } -TEST_F(TestDecimal256ToRealFloat, Precision) { this->TestPrecision(); } +TYPED_TEST(TestDecimalToRealFloat, Precision) { + // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision) + CheckDecimalToReal("9223373136366403584", 0, 9.223373e+18f); + CheckDecimalToReal("-9223373136366403584", 0, -9.223373e+18f); + // 2**64 + 2**41 (exactly representable in a float) + CheckDecimalToReal("18446746272732807168", 0, 1.8446746e+19f); + CheckDecimalToReal("-18446746272732807168", 0, -1.8446746e+19f); + + // Integers are always exact + auto scale = TypeParam::kMaxScale - 1; + std::string seven = "7."; + seven.append(scale, '0'); // pad with trailing zeros + CheckDecimalToReal(seven, scale, 7.0f); + CheckDecimalToReal("-" + seven, scale, -7.0f); + + CheckDecimalToReal("99999999999999999999.0000000000000000", 16, + 99999999999999999999.0f); + CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, + -99999999999999999999.0f); + + // Small fractions are within one ULP + CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9f); + CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9f); + CheckDecimalToRealWithinOneULP("9999999.999999", 6, 9999999.999999f); + CheckDecimalToRealWithinOneULP("-9999999.999999", 6, + -9999999.999999f); + + // Large fractions are within 2^-23 + constexpr float epsilon = 1.1920928955078125e-07f; // 2^-23 + CheckDecimalToRealWithinEpsilon( + "112334829348925.99070703983306884765625", 23, epsilon, + 112334829348925.99070703983306884765625f); + CheckDecimalToRealWithinEpsilon( + "1.987748987892758765582589910934859345", 36, epsilon, + 1.987748987892758765582589910934859345f); +} // ToReal tests are disabled on MinGW because of precision issues in results #ifndef __MINGW32__ -// Custom test for Decimal128::ToReal +// Custom test for Decimal::ToReal template class TestDecimalToRealDouble : public TestDecimalToReal> { }; @@ -1209,6 +1256,34 @@ TYPED_TEST(TestDecimalToRealDouble, Precision) { 
9.999999999999998e+47); CheckDecimalToReal("-99999999999999978859343891977453174784", -10, -9.999999999999998e+47); + // Integers are always exact + auto scale = TypeParam::kMaxScale - 1; + std::string seven = "7."; + seven.append(scale, '0'); + CheckDecimalToReal(seven, scale, 7.0); + CheckDecimalToReal("-" + seven, scale, -7.0); + + CheckDecimalToReal("99999999999999999999.0000000000000000", 16, + 99999999999999999999.0); + CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, + -99999999999999999999.0); + + // Small fractions are within one ULP + CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9); + CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9); + CheckDecimalToRealWithinOneULP("9999999.999999999999999", 15, + 9999999.999999999999999); + CheckDecimalToRealWithinOneULP("-9999999.999999999999999", 15, + -9999999.999999999999999); + + // Large fractions are within 2^-52 + constexpr double epsilon = 2.220446049250313080847263336181640625e-16; // 2^-52 + CheckDecimalToRealWithinEpsilon( + "112334829348925.99070703983306884765625", 23, epsilon, + 112334829348925.99070703983306884765625); + CheckDecimalToRealWithinEpsilon( + "1.987748987892758765582589910934859345", 36, epsilon, + 1.987748987892758765582589910934859345); } #endif // __MINGW32__ diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 18cac9ae11c61..ac92618ff6603 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -1896,7 +1897,8 @@ std::vector GetPlatformTemporaryDirs() { } std::string MakeRandomName(int num_chars) { - static const std::string chars = "0123456789abcdefghijklmnopqrstuvwxyz"; + constexpr std::string_view chars = "0123456789abcdefghijklmnopqrstuvwxyz"; + std::default_random_engine gen( static_cast(GetRandomSeed())); std::uniform_int_distribution dist(0, static_cast(chars.length() - 1)); diff --git a/cpp/src/arrow/util/logging_test.cc b/cpp/src/arrow/util/logging_test.cc index 547e0bba3c5f7..9c991213dd4e1 100644 --- a/cpp/src/arrow/util/logging_test.cc +++ b/cpp/src/arrow/util/logging_test.cc @@ -68,6 +68,33 @@ TEST(PrintLogTest, LogTestWithInit) { ArrowLog::ShutDownArrowLog(); } +struct LoggingTracer { + mutable bool was_printed = false; + + friend std::ostream& operator<<(std::ostream& os, const LoggingTracer& x) { + x.was_printed = true; + return os; + } +}; + +TEST(ArrowCheck, PayloadNotEvaluatedOnSuccess) { + volatile bool cond = true; + LoggingTracer tracer; + + ARROW_CHECK_OR_LOG(cond, WARNING) << "Some message" << tracer; + ASSERT_FALSE(tracer.was_printed); +} + +TEST(ArrowCheck, PayloadEvaluatedOnFailure) { + volatile bool cond = false; + LoggingTracer tracer; + + // Have to use a log level that actually gets printed, otherwise `operator<<` + // isn't called (which is good except for this test). 
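The LoggingTracer tests verify that ARROW_CHECK_OR_LOG evaluates its streamed payload only when the check fails. A sketch of the usual short-circuit idiom behind such macros (the generic glog-style pattern, not a copy of Arrow's logging.h):

```cpp
#include <iostream>

namespace sketch {

// void-returning operator& binds *after* every operator<< in the chain, so
// the whole third operand of ?: has type void, matching (void)0 in the second.
struct Voidify {
  void operator&(std::ostream&) {}
};

}  // namespace sketch

// When `condition` is true the ternary short-circuits: none of the streamed
// operands are evaluated.
#define SKETCH_CHECK_OR_LOG(condition) \
  (condition) ? (void)0 : sketch::Voidify() & std::cerr << "Check failed: "

int main() {
  int evaluations = 0;
  auto payload = [&evaluations]() { ++evaluations; return 42; };

  SKETCH_CHECK_OR_LOG(true) << payload();           // payload() never runs
  SKETCH_CHECK_OR_LOG(false) << payload() << "\n";  // payload() runs once

  std::cerr << "evaluations = " << evaluations << "\n";  // prints 1
  return 0;
}
```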
+ ARROW_CHECK_OR_LOG(cond, WARNING) << "Some message" << tracer; + ASSERT_TRUE(tracer.was_printed); +} + } // namespace util TEST(DcheckMacros, DoNotEvaluateReleaseMode) { diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 11da64313a811..fcd6c204e06b2 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -85,6 +85,26 @@ int64_t FindPhysicalLength(const ArraySpan& span) { return internal::FindPhysicalLength(span); } +std::pair FindPhysicalRange(const ArraySpan& span, int64_t offset, + int64_t length) { + const auto& run_ends_span = RunEndsArray(span); + auto type_id = run_ends_span.type->id(); + if (type_id == Type::INT16) { + auto* run_ends = run_ends_span.GetValues(1); + return internal::FindPhysicalRange(run_ends, run_ends_span.length, length, + offset); + } + if (type_id == Type::INT32) { + auto* run_ends = run_ends_span.GetValues(1); + return internal::FindPhysicalRange(run_ends, run_ends_span.length, length, + offset); + } + DCHECK_EQ(type_id, Type::INT64); + auto* run_ends = run_ends_span.GetValues(1); + return internal::FindPhysicalRange(run_ends, run_ends_span.length, length, + offset); +} + namespace { template diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index e708eb0b59472..5a240240b859f 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -73,24 +73,38 @@ int64_t FindPhysicalIndex(const RunEndCType* run_ends, int64_t run_ends_size, in return result; } -/// \brief Uses binary-search to calculate the number of physical values (and +/// \brief Uses binary-search to calculate the range of physical values (and /// run-ends) necessary to represent the logical range of values from /// offset to length +/// +/// \return a pair of physical offset and physical length template -int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size, - int64_t length, int64_t offset) { +std::pair FindPhysicalRange(const RunEndCType* run_ends, + int64_t run_ends_size, int64_t length, + int64_t offset) { + const int64_t physical_offset = + FindPhysicalIndex(run_ends, run_ends_size, 0, offset); // The physical length is calculated by finding the offset of the last element // and adding 1 to it, so first we ensure there is at least one element. if (length == 0) { - return 0; + return {physical_offset, 0}; } - const int64_t physical_offset = - FindPhysicalIndex(run_ends, run_ends_size, 0, offset); const int64_t physical_index_of_last = FindPhysicalIndex( run_ends + physical_offset, run_ends_size - physical_offset, length - 1, offset); assert(physical_index_of_last < run_ends_size - physical_offset); - return physical_index_of_last + 1; + return {physical_offset, physical_index_of_last + 1}; +} + +/// \brief Uses binary-search to calculate the number of physical values (and +/// run-ends) necessary to represent the logical range of values from +/// offset to length +template +int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size, + int64_t length, int64_t offset) { + auto [_, physical_length] = + FindPhysicalRange(run_ends, run_ends_size, length, offset); + return physical_length; } /// \brief Find the physical index into the values array of the REE ArraySpan @@ -125,7 +139,8 @@ int64_t FindPhysicalLength(const ArraySpan& span) { /// \brief Find the physical index into the values array of the REE ArraySpan /// /// This function uses binary-search, so it has a O(log N) cost. 
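FindPhysicalRange returns the physical offset and length in one call where callers previously combined FindPhysicalIndex and FindPhysicalLength, and the reordering above also keeps the length == 0 case reporting a meaningful physical offset. A standalone sketch of the same computation over a plain run-ends vector (it mirrors the logic; the ree_util templates operate on raw pointers and three run-end widths):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// run_ends[i] is the exclusive logical end of run i. The run containing
// logical index x is the first i with run_ends[i] > x, found by binary
// search (std::upper_bound), so the whole computation is O(log N).
std::pair<int64_t, int64_t> FindPhysicalRange(const std::vector<int64_t>& run_ends,
                                              int64_t offset, int64_t length) {
  auto first = std::upper_bound(run_ends.begin(), run_ends.end(), offset);
  if (length == 0) return {first - run_ends.begin(), 0};
  // Last logical index of the slice; assumes offset + length is within bounds.
  auto last = std::upper_bound(first, run_ends.end(), offset + length - 1);
  return {first - run_ends.begin(), (last - first) + 1};
}

int main() {
  // Four runs covering logical [0, 10): they end at 2, 3, 7 and 10.
  const std::vector<int64_t> run_ends = {2, 3, 7, 10};
  // Logical slice [3, 8) touches run 2 ([3, 7)) and run 3 ([7, 10)).
  auto [phys_offset, phys_length] = FindPhysicalRange(run_ends, 3, 5);
  std::printf("offset=%lld length=%lld\n", static_cast<long long>(phys_offset),
              static_cast<long long>(phys_length));  // offset=2 length=2
  return 0;
}
```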
-int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset); +ARROW_EXPORT int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, + int64_t absolute_offset); /// \brief Find the physical length of an REE ArraySpan /// @@ -136,7 +151,15 @@ int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_off /// Avoid calling this function if the physical length can be estabilished in /// some other way (e.g. when iterating over the runs sequentially until the /// end). This function uses binary-search, so it has a O(log N) cost. -int64_t FindPhysicalLength(const ArraySpan& span); +ARROW_EXPORT int64_t FindPhysicalLength(const ArraySpan& span); + +/// \brief Find the physical range of physical values referenced by the REE in +/// the logical range from offset to offset + length +/// +/// \return a pair of physical offset and physical length +ARROW_EXPORT std::pair FindPhysicalRange(const ArraySpan& span, + int64_t offset, + int64_t length); template class RunEndEncodedArraySpan { diff --git a/cpp/src/arrow/util/span.h b/cpp/src/arrow/util/span.h new file mode 100644 index 0000000000000..4254fec75e145 --- /dev/null +++ b/cpp/src/arrow/util/span.h @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace arrow::util { + +/// std::span polyfill. +/// +/// Does not support static extents. +template +class span { + static_assert(sizeof(T), + R"( +std::span allows contiguous_iterators instead of just pointers, the enforcement +of which requires T to be a complete type. 
arrow::util::span does not support +contiguous_iterators, but T is still required to be a complete type to prevent +writing code which would break when it is replaced by std::span.)"); + + public: + using element_type = T; + using value_type = std::remove_cv_t; + using iterator = T*; + using const_iterator = T const*; + + span() = default; + span(const span&) = default; + span& operator=(const span&) = default; + + template >> + // NOLINTNEXTLINE runtime/explicit + constexpr span(span mut) : span{mut.data(), mut.size()} {} + + constexpr span(T* data, size_t count) : data_{data}, size_{count} {} + + constexpr span(T* begin, T* end) + : data_{begin}, size_{static_cast(end - begin)} {} + + template < + typename R, + typename DisableUnlessConstructibleFromDataAndSize = + decltype(span(std::data(std::declval()), std::size(std::declval()))), + typename DisableUnlessSimilarTypes = std::enable_if_t()))>>, + std::decay_t>>> + // NOLINTNEXTLINE runtime/explicit, non-const reference + constexpr span(R&& range) : span{std::data(range), std::size(range)} {} + + constexpr T* begin() const { return data_; } + constexpr T* end() const { return data_ + size_; } + constexpr T* data() const { return data_; } + + constexpr size_t size() const { return size_; } + constexpr size_t size_bytes() const { return size_ * sizeof(T); } + constexpr bool empty() const { return size_ == 0; } + + constexpr T& operator[](size_t i) { return data_[i]; } + constexpr const T& operator[](size_t i) const { return data_[i]; } + + constexpr span subspan(size_t offset) const { + if (offset > size_) return {data_, data_}; + return {data_ + offset, size_ - offset}; + } + + constexpr span subspan(size_t offset, size_t count) const { + auto out = subspan(offset); + if (count < out.size_) { + out.size_ = count; + } + return out; + } + + constexpr bool operator==(span const& other) const { + if (size_ != other.size_) return false; + + if constexpr (std::is_integral_v) { + if (size_ == 0) { + return true; // memcmp does not handle null pointers, even if size_ == 0 + } + return std::memcmp(data_, other.data_, size_bytes()) == 0; + } else { + T* ptr = data_; + for (T const& e : other) { + if (*ptr++ != e) return false; + } + return true; + } + } + constexpr bool operator!=(span const& other) const { return !(*this == other); } + + private: + T* data_{}; + size_t size_{}; +}; + +template +span(R& range) -> span>; + +template +span(T*, size_t) -> span; + +template +constexpr span as_bytes(span s) { + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +template +constexpr span as_writable_bytes(span s) { + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/span_test.cc b/cpp/src/arrow/util/span_test.cc new file mode 100644 index 0000000000000..fcbb49f71e5d0 --- /dev/null +++ b/cpp/src/arrow/util/span_test.cc @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" +#include "arrow/util/span.h" + +using testing::ElementsAre; +using testing::ElementsAreArray; +using testing::PrintToString; + +namespace arrow::util { + +template +std::ostream& operator<<(std::ostream& os, const span& span) { + // Inefficient but good enough for testing + os << PrintToString(std::vector(span.begin(), span.end())); + return os; +} + +TEST(Span, Construction) { + // const spans may be constructed from mutable spans + static_assert(std::is_constructible_v, span>); + // ... but mutable spans may be constructed from const spans + static_assert(!std::is_constructible_v, span>); + + int arr[] = {1, 2, 3}; + constexpr int const_arr[] = {7, 8, 9}; + + static_assert(std::is_constructible_v, decltype(arr)&>); + static_assert(!std::is_constructible_v, decltype(const_arr)&>); + + static_assert(std::is_constructible_v, decltype(arr)&>); + static_assert(std::is_constructible_v, decltype(const_arr)&>); + static_assert(std::is_constructible_v, span>); + + static_assert(std::is_constructible_v, std::vector&>); + static_assert(!std::is_constructible_v, const std::vector&>); + static_assert(!std::is_constructible_v, std::vector&&>); + + static_assert(std::is_constructible_v, std::vector&>); + static_assert(std::is_constructible_v, const std::vector&>); + // const spans may even be constructed from rvalue ranges + static_assert(std::is_constructible_v, std::vector&&>); + + EXPECT_THAT(span(const_arr), ElementsAreArray(const_arr)); + EXPECT_THAT(span(arr), ElementsAreArray(arr)); + + static_assert(!std::is_constructible_v, decltype(const_arr)&>); + static_assert(!std::is_constructible_v, decltype(const_arr)&>); +} + +TEST(Span, TemplateArgumentDeduction) { + int arr[3]; + const int const_arr[] = {1, 2, 3}; + std::vector vec; + const std::vector const_vec; + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); +} + +TEST(Span, Size) { + int arr[] = {1, 2, 3}; + EXPECT_EQ(span(arr).size(), 3); + EXPECT_EQ(span(arr).size_bytes(), sizeof(int) * 3); + + std::vector vec; + EXPECT_TRUE(span(vec).empty()); + EXPECT_EQ(span(vec).size(), 0); + EXPECT_EQ(span(vec).size_bytes(), 0); + + vec.resize(999); + EXPECT_FALSE(span(vec).empty()); + EXPECT_EQ(span(vec).size(), 999); + EXPECT_EQ(span(vec).size_bytes(), sizeof(int) * 999); +} + +TEST(Span, Equality) { + auto check_eq = [](auto l, auto r) { + ARROW_SCOPED_TRACE("l = ", l, ", r = ", r); + EXPECT_TRUE(l == r); + EXPECT_FALSE(l != r); + }; + auto check_ne = [](auto l, auto r) { + ARROW_SCOPED_TRACE("l = ", l, ", r = ", r); + EXPECT_TRUE(l != r); + EXPECT_FALSE(l == r); + }; + + { + // exercise integral branch with memcmp + check_eq(span(), span()); + + int arr[] = {1, 2, 3}; + check_eq(span(arr), span(arr)); + check_eq(span(arr).subspan(1), span(arr).subspan(1)); + check_ne(span(arr).subspan(1), span(arr).subspan(2)); + + std::vector vec{1, 2, 3}; + check_eq(span(vec), span(arr)); + check_eq(span(vec).subspan(1), span(arr).subspan(1)); + + vec = 
{2, 3, 4}; + check_ne(span(vec), span(arr)); + check_eq(span(vec).subspan(0, 2), span(arr).subspan(1)); + + // 0-sized + vec = {}; + check_ne(span(vec), span(arr)); + check_eq(span(vec), span(arr).subspan(3)); + } + { + // exercise non-integral branch with for loop + check_eq(span(), span()); + + std::string arr[] = {"a", "b", "c"}; + check_eq(span(arr), span(arr)); + check_eq(span(arr).subspan(1), span(arr).subspan(1)); + + std::vector vec{"a", "b", "c"}; + check_eq(span(vec), span(arr)); + check_eq(span(vec).subspan(1), span(arr).subspan(1)); + + vec = {"b", "c", "d"}; + check_ne(span(vec), span(arr)); + check_eq(span(vec).subspan(0, 2), span(arr).subspan(1)); + + // 0-sized + vec = {}; + check_ne(span(vec), span(arr)); + check_eq(span(vec), span(arr).subspan(3)); + } +} + +TEST(Span, SubSpan) { + int arr[] = {1, 2, 3}; + span s(arr); + + auto ExpectIdentical = [](span l, span r) { + EXPECT_EQ(l.data(), r.data()); + EXPECT_EQ(l.size(), r.size()); + }; + + ExpectIdentical(s.subspan(0), s); + ExpectIdentical(s.subspan(0, s.size()), s); + + for (size_t offset = 0; offset < s.size(); ++offset) { + span expected(arr + offset, s.size() - offset); + ExpectIdentical(s.subspan(offset), expected); + ExpectIdentical(s.subspan(offset, s.size() * 3), expected); + } + EXPECT_TRUE(s.subspan(s.size()).empty()); + EXPECT_TRUE(s.subspan(s.size() * 3).empty()); + + for (size_t length = 0; length < s.size(); ++length) { + span expected(arr, length); + ExpectIdentical(s.subspan(0, length), expected); + } + + ExpectIdentical(s.subspan(1, 1), span(arr + 1, 1)); +} + +TEST(Span, Mutation) { + size_t arr[] = {9, 9, 9, 9, 9}; + + span s(arr); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = i * i; + } + + EXPECT_THAT(arr, ElementsAre(0, 1, 4, 9, 16)); + + auto set = [](span lhs, size_t rhs) { + for (size_t& i : lhs) { + i = rhs; + } + }; + set(span(arr), 0); + set(span(arr).subspan(1), 1); + set(span(arr).subspan(2, 2), 23); + EXPECT_THAT(arr, ElementsAre(0, 1, 23, 23, 1)); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index 92495612a7df8..f6a24ac1467f8 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
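A usage sketch for the arrow::util::span introduced above, exercising the range constructor, the implicit mutable-to-const conversion, and the clamping subspan() behavior shown in the header (assumes the Arrow source tree is on the include path):

```cpp
#include <cstdio>
#include <vector>

#include "arrow/util/span.h"

int main() {
  std::vector<int> values = {1, 2, 3, 4, 5};

  arrow::util::span<int> mut(values);     // non-owning view over the vector
  arrow::util::span<const int> ro = mut;  // mutable -> const is implicit

  mut[0] = 10;                        // writes through to `values`
  auto tail = ro.subspan(3);          // {4, 5}
  auto clamped = ro.subspan(3, 100);  // count clamps to the 2 remaining items

  std::printf("size=%zu tail=%zu clamped=%zu first=%d\n", ro.size(),
              tail.size(), clamped.size(), ro[0]);  // size=5 tail=2 clamped=2 first=10
  return 0;
}
```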
+#define FASTFLOAT_ALLOWS_LEADING_PLUS 1 + #include "arrow/util/value_parsing.h" #include diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 6f83b6dfa6592..30c5e6aae74ba 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -119,6 +119,9 @@ TEST(StringConversion, ToFloat) { AssertConversion("0", 0.0f); AssertConversion("-0.0", -0.0f); AssertConversion("-1e20", -1e20f); + AssertConversion("+Infinity", std::numeric_limits::infinity()); + AssertConversion("-Infinity", -std::numeric_limits::infinity()); + AssertConversion("Infinity", std::numeric_limits::infinity()); AssertConversionFails(""); AssertConversionFails("e"); @@ -135,6 +138,9 @@ TEST(StringConversion, ToDouble) { AssertConversion("0", 0); AssertConversion("-0.0", -0.0); AssertConversion("-1e100", -1e100); + AssertConversion("+Infinity", std::numeric_limits::infinity()); + AssertConversion("-Infinity", -std::numeric_limits::infinity()); + AssertConversion("Infinity", std::numeric_limits::infinity()); AssertConversionFails(""); AssertConversionFails("e"); diff --git a/cpp/src/arrow/vendored/fast_float/README.md b/cpp/src/arrow/vendored/fast_float/README.md index 6d44654f2a721..b07c280e0ad44 100644 --- a/cpp/src/arrow/vendored/fast_float/README.md +++ b/cpp/src/arrow/vendored/fast_float/README.md @@ -20,7 +20,7 @@ # fast_float The files in this directory are vendored from fast_float -git tag `v3.8.1`. +git tag `v3.10.1`. See https://github.com/fastfloat/fast_float @@ -31,7 +31,7 @@ See https://github.com/fastfloat/fast_float ## How to update You must replace `VERSION` in the command lines with suitable version -such as `3.8.1`. +such as `3.10.1`. ```bash cpp/src/arrow/vendoered/fast_float/update.sh VERSION diff --git a/cpp/src/arrow/vendored/fast_float/ascii_number.h b/cpp/src/arrow/vendored/fast_float/ascii_number.h index 24ec813174a7a..6d825ccfb5a48 100644 --- a/cpp/src/arrow/vendored/fast_float/ascii_number.h +++ b/cpp/src/arrow/vendored/fast_float/ascii_number.h @@ -13,9 +13,11 @@ namespace fast_float { // Next function can be micro-optimized, but compilers are entirely // able to optimize it well. 
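The FASTFLOAT_ALLOWS_LEADING_PLUS define that value_parsing.cc now sets before its includes is what lets the new "+Infinity" test cases pass: by default fast_float follows C++17 std::from_chars and rejects a leading '+'. A sketch directly against the vendored API (assumes the vendored header is reachable on the include path):

```cpp
// Must be defined before the first fast_float include, as value_parsing.cc does.
#define FASTFLOAT_ALLOWS_LEADING_PLUS 1
#include "arrow/vendored/fast_float/fast_float.h"

#include <cassert>
#include <cmath>
#include <string_view>
#include <system_error>

int main() {
  namespace ff = arrow_vendored::fast_float;
  std::string_view s = "+Infinity";
  double value = 0;
  auto result = ff::from_chars(s.data(), s.data() + s.size(), value);
  assert(result.ec == std::errc());        // parsed without error
  assert(std::isinf(value) && value > 0);  // positive infinity
  return 0;
}
```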
-fastfloat_really_inline bool is_integer(char c) noexcept { return c >= '0' && c <= '9'; } +fastfloat_really_inline constexpr bool is_integer(char c) noexcept { + return c >= '0' && c <= '9'; +} -fastfloat_really_inline uint64_t byteswap(uint64_t val) { +fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 | (val & 0x0000FF0000000000) >> 24 @@ -45,7 +47,8 @@ fastfloat_really_inline void write_u64(uint8_t *chars, uint64_t val) { } // credit @aqrit -fastfloat_really_inline uint32_t parse_eight_digits_unrolled(uint64_t val) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +uint32_t parse_eight_digits_unrolled(uint64_t val) { const uint64_t mask = 0x000000FF000000FF; const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) @@ -60,7 +63,7 @@ fastfloat_really_inline uint32_t parse_eight_digits_unrolled(const char *chars) } // credit @aqrit -fastfloat_really_inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept { +fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept { return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & 0x8080808080808080)); } @@ -94,7 +97,11 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_ answer.valid = false; answer.too_many_digits = false; answer.negative = (*p == '-'); +#if FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == '-') || (*p == '+')) { +#else if (*p == '-') { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here +#endif ++p; if (p == pend) { return answer; diff --git a/cpp/src/arrow/vendored/fast_float/bigint.h b/cpp/src/arrow/vendored/fast_float/bigint.h index b733c7b64ba6a..bc083893ac4ca 100644 --- a/cpp/src/arrow/vendored/fast_float/bigint.h +++ b/cpp/src/arrow/vendored/fast_float/bigint.h @@ -51,27 +51,27 @@ struct stackvec { stackvec &operator=(stackvec &&other) = delete; // create stack vector from existing limb span. - stackvec(limb_span s) { + FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) { FASTFLOAT_ASSERT(try_extend(s)); } - limb& operator[](size_t index) noexcept { + FASTFLOAT_CONSTEXPR14 limb& operator[](size_t index) noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return data[index]; } - const limb& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const limb& operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return data[index]; } // index from the end of the container - const limb& rindex(size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const limb& rindex(size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); size_t rindex = length - index - 1; return data[rindex]; } // set the length, without bounds checking. 
- void set_len(size_t len) noexcept { + FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept { length = uint16_t(len); } constexpr size_t len() const noexcept { @@ -84,12 +84,12 @@ struct stackvec { return size; } // append item to vector, without bounds checking - void push_unchecked(limb value) noexcept { + FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept { data[length] = value; length++; } // append item to vector, returning if item was added - bool try_push(limb value) noexcept { + FASTFLOAT_CONSTEXPR14 bool try_push(limb value) noexcept { if (len() < capacity()) { push_unchecked(value); return true; @@ -98,13 +98,13 @@ struct stackvec { } } // add items to the vector, from a span, without bounds checking - void extend_unchecked(limb_span s) noexcept { + FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept { limb* ptr = data + length; - ::memcpy((void*)ptr, (const void*)s.ptr, sizeof(limb) * s.len()); + std::copy_n(s.ptr, s.len(), ptr); set_len(len() + s.len()); } // try to add items to the vector, returning if items were added - bool try_extend(limb_span s) noexcept { + FASTFLOAT_CONSTEXPR20 bool try_extend(limb_span s) noexcept { if (len() + s.len() <= capacity()) { extend_unchecked(s); return true; @@ -115,6 +115,7 @@ struct stackvec { // resize the vector, without bounds checking // if the new size is longer than the vector, assign value to each // appended item. + FASTFLOAT_CONSTEXPR20 void resize_unchecked(size_t new_len, limb value) noexcept { if (new_len > len()) { size_t count = new_len - len(); @@ -127,7 +128,7 @@ struct stackvec { } } // try to resize the vector, returning if the vector was resized. - bool try_resize(size_t new_len, limb value) noexcept { + FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept { if (new_len > capacity()) { return false; } else { @@ -138,7 +139,7 @@ struct stackvec { // check if any limbs are non-zero after the given index. // this needs to be done in reverse order, since the index // is relative to the most significant limbs. - bool nonzero(size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 bool nonzero(size_t index) const noexcept { while (index < len()) { if (rindex(index) != 0) { return true; @@ -148,27 +149,27 @@ struct stackvec { return false; } // normalize the big integer, so most-significant zero limbs are removed. 
- void normalize() noexcept { + FASTFLOAT_CONSTEXPR14 void normalize() noexcept { while (len() > 0 && rindex(0) == 0) { length--; } } }; -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t empty_hi64(bool& truncated) noexcept { truncated = false; return 0; } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept { truncated = false; int shl = leading_zeroes(r0); return r0 << shl; } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { int shl = leading_zeroes(r0); if (shl == 0) { @@ -181,19 +182,19 @@ uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { } } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept { return uint64_hi64(r0, truncated); } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept { uint64_t x0 = r0; uint64_t x1 = r1; return uint64_hi64((x0 << 32) | x1, truncated); } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept { uint64_t x0 = r0; uint64_t x1 = r1; @@ -205,15 +206,16 @@ uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noe // we want an efficient operation. for msvc, where // we don't have built-in intrinsics, this is still // pretty fast. -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb scalar_add(limb x, limb y, bool& overflow) noexcept { limb z; - // gcc and clang #if defined(__has_builtin) #if __has_builtin(__builtin_add_overflow) - overflow = __builtin_add_overflow(x, y, &z); - return z; + if (!cpp20_and_in_constexpr()) { + overflow = __builtin_add_overflow(x, y, &z); + return z; + } #endif #endif @@ -224,7 +226,7 @@ limb scalar_add(limb x, limb y, bool& overflow) noexcept { } // multiply two small integers, getting both the high and low bits. -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb scalar_mul(limb x, limb y, limb& carry) noexcept { #ifdef FASTFLOAT_64BIT_LIMB #if defined(__SIZEOF_INT128__) @@ -252,7 +254,8 @@ limb scalar_mul(limb x, limb y, limb& carry) noexcept { // add scalar value to bigint starting from offset. // used in grade school multiplication template -inline bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { +inline FASTFLOAT_CONSTEXPR20 +bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { size_t index = start; limb carry = y; bool overflow; @@ -269,13 +272,15 @@ inline bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { // add scalar value to bigint. template -fastfloat_really_inline bool small_add(stackvec& vec, limb y) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool small_add(stackvec& vec, limb y) noexcept { return small_add_from(vec, y, 0); } // multiply bigint by scalar value. template -inline bool small_mul(stackvec& vec, limb y) noexcept { +inline FASTFLOAT_CONSTEXPR20 +bool small_mul(stackvec& vec, limb y) noexcept { limb carry = 0; for (size_t index = 0; index < vec.len(); index++) { vec[index] = scalar_mul(vec[index], y, carry); @@ -289,6 +294,7 @@ inline bool small_mul(stackvec& vec, limb y) noexcept { // add bigint to bigint starting from index. 
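The scalar_add change above is the recurring pattern of this vendor bump: keep the compiler intrinsic for run-time calls, but fall back to portable arithmetic during constant evaluation, where the intrinsic historically was not usable (fast_float wraps the test in its cpp20_and_in_constexpr() helper). A self-contained sketch of the same dispatch, assuming GCC/Clang builtins:

```cpp
#include <cstdint>

// Portable add-with-carry that stays usable in constant expressions while
// still using __builtin_add_overflow at run time when available.
constexpr uint64_t add_carry(uint64_t x, uint64_t y, bool& overflow) {
#if defined(__has_builtin)
#if __has_builtin(__builtin_add_overflow) && __has_builtin(__builtin_is_constant_evaluated)
  if (!__builtin_is_constant_evaluated()) {
    uint64_t z;
    overflow = __builtin_add_overflow(x, y, &z);
    return z;
  }
#endif
#endif
  const uint64_t z = x + y;  // unsigned overflow wraps modulo 2^64
  overflow = z < x;          // a carry occurred iff the sum wrapped
  return z;
}

// Exercised in constant evaluation: the portable branch is taken.
static_assert([] {
  bool overflow = false;
  add_carry(~uint64_t{0}, 1, overflow);
  return overflow;
}());

int main() { return 0; }
```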
// used in grade school multiplication template +FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { // the effective x buffer is from `xstart..x.len()`, so exit early // if we can't get that current range. @@ -319,12 +325,14 @@ bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { // add bigint to bigint. template -fastfloat_really_inline bool large_add_from(stackvec& x, limb_span y) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool large_add_from(stackvec& x, limb_span y) noexcept { return large_add_from(x, y, 0); } // grade-school multiplication algorithm template +FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec& x, limb_span y) noexcept { limb_span xs = limb_span(x.data, x.len()); stackvec z(xs); @@ -353,6 +361,7 @@ bool long_mul(stackvec& x, limb_span y) noexcept { // grade-school multiplication algorithm template +FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec& x, limb_span y) noexcept { if (y.len() == 1) { FASTFLOAT_TRY(small_mul(x, y[0])); @@ -362,21 +371,52 @@ bool large_mul(stackvec& x, limb_span y) noexcept { return true; } +template +struct pow5_tables { + static constexpr uint32_t large_step = 135; + static constexpr uint64_t small_power_of_5[] = { + 1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL, + 1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL, + 6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL, + 3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL, + 2384185791015625UL, 11920928955078125UL, 59604644775390625UL, + 298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL, + }; +#ifdef FASTFLOAT_64BIT_LIMB + constexpr static limb large_power_of_5[] = { + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; +#else + constexpr static limb large_power_of_5[] = { + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; +#endif +}; + +template +constexpr uint32_t pow5_tables::large_step; + +template +constexpr uint64_t pow5_tables::small_power_of_5[]; + +template +constexpr limb pow5_tables::large_power_of_5[]; + // big integer type. implements a small subset of big integer // arithmetic, using simple algorithms since asymptotically // faster algorithms are slower for a small number of limbs. // all operations assume the big-integer is normalized. -struct bigint { +struct bigint : pow5_tables<> { // storage of the limbs, in little-endian order. stackvec vec; - bigint(): vec() {} + FASTFLOAT_CONSTEXPR20 bigint(): vec() {} bigint(const bigint &) = delete; bigint &operator=(const bigint &) = delete; bigint(bigint &&) = delete; bigint &operator=(bigint &&other) = delete; - bigint(uint64_t value): vec() { + FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() { #ifdef FASTFLOAT_64BIT_LIMB vec.push_unchecked(value); #else @@ -388,7 +428,7 @@ struct bigint { // get the high 64 bits from the vector, and if bits were truncated. // this is to get the significant digits for the float. - uint64_t hi64(bool& truncated) const noexcept { + FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept { #ifdef FASTFLOAT_64BIT_LIMB if (vec.len() == 0) { return empty_hi64(truncated); @@ -420,7 +460,7 @@ struct bigint { // positive, this is larger, otherwise they are equal. // the limbs are stored in little-endian order, so we // must compare the limbs in ever order. 
- int compare(const bigint& other) const noexcept { + FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept { if (vec.len() > other.vec.len()) { return 1; } else if (vec.len() < other.vec.len()) { @@ -441,7 +481,7 @@ struct bigint { // shift left each limb n bits, carrying over to the new limb // returns true if we were able to shift all the digits. - bool shl_bits(size_t n) noexcept { + FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept { // Internally, for each item, we shift left by n, and add the previous // right shifted limb-bits. // For example, we transform (for u8) shifted left 2, to: @@ -467,7 +507,7 @@ struct bigint { } // move the limbs left by `n` limbs. - bool shl_limbs(size_t n) noexcept { + FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept { FASTFLOAT_DEBUG_ASSERT(n != 0); if (n + vec.len() > vec.capacity()) { return false; @@ -488,7 +528,7 @@ struct bigint { } // move the limbs left by `n` bits. - bool shl(size_t n) noexcept { + FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept { size_t rem = n % limb_bits; size_t div = n / limb_bits; if (rem != 0) { @@ -501,7 +541,7 @@ struct bigint { } // get the number of leading zeros in the bigint. - int ctlz() const noexcept { + FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept { if (vec.is_empty()) { return 0; } else { @@ -516,45 +556,27 @@ struct bigint { } // get the number of bits in the bigint. - int bit_length() const noexcept { + FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept { int lz = ctlz(); return int(limb_bits * vec.len()) - lz; } - bool mul(limb y) noexcept { + FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); } - bool add(limb y) noexcept { + FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); } // multiply as if by 2 raised to a power. - bool pow2(uint32_t exp) noexcept { + FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); } // multiply as if by 5 raised to a power. - bool pow5(uint32_t exp) noexcept { + FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept { // multiply by a power of 5 - static constexpr uint32_t large_step = 135; - static constexpr uint64_t small_power_of_5[] = { - 1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL, - 1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL, - 6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL, - 3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL, - 2384185791015625UL, 11920928955078125UL, 59604644775390625UL, - 298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL, - }; -#ifdef FASTFLOAT_64BIT_LIMB - constexpr static limb large_power_of_5[] = { - 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, - 10482974169319127550UL, 198276706040285095UL}; -#else - constexpr static limb large_power_of_5[] = { - 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, - 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; -#endif size_t large_length = sizeof(large_power_of_5) / sizeof(limb); limb_span large = limb_span(large_power_of_5, large_length); while (exp >= large_step) { @@ -580,7 +602,7 @@ struct bigint { } // multiply as if by 10 raised to a power. 
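The pow10 implementation just below rests on the identity 10^e = 5^e * 2^e: only the 5^e factor needs genuine bignum multiplication (pow5), while the 2^e factor is a plain left shift of the little-endian limb array (pow2/shl). A tiny check of the identity in native integers:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // 10^12 is the largest power checked here; it still fits in uint64_t.
  for (uint32_t e = 0; e <= 12; ++e) {
    uint64_t pow5 = 1, pow10 = 1;
    for (uint32_t i = 0; i < e; ++i) {
      pow5 *= 5;
      pow10 *= 10;
    }
    assert(pow10 == (pow5 << e));  // 10^e == 5^e * 2^e
  }
  return 0;
}
```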
- bool pow10(uint32_t exp) noexcept { + FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept { FASTFLOAT_TRY(pow5(exp)); return pow2(exp); } diff --git a/cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h b/cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h new file mode 100644 index 0000000000000..ba8b65c64a160 --- /dev/null +++ b/cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h @@ -0,0 +1,40 @@ +#ifndef FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H +#define FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + +// Testing for https://wg21.link/N3652, adopted in C++14 +#if __cpp_constexpr >= 201304 +#define FASTFLOAT_CONSTEXPR14 constexpr +#else +#define FASTFLOAT_CONSTEXPR14 +#endif + +#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L +#define FASTFLOAT_HAS_BIT_CAST 1 +#else +#define FASTFLOAT_HAS_BIT_CAST 0 +#endif + +#if defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 +#else +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 +#endif + +// Testing for relevant C++20 constexpr library features +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \ + && FASTFLOAT_HAS_BIT_CAST \ + && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#define FASTFLOAT_CONSTEXPR20 constexpr +#define FASTFLOAT_IS_CONSTEXPR 1 +#else +#define FASTFLOAT_CONSTEXPR20 +#define FASTFLOAT_IS_CONSTEXPR 0 +#endif + +#endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H diff --git a/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h b/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h index 8ae481d323865..9390228c3946a 100644 --- a/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h +++ b/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h @@ -64,7 +64,7 @@ namespace detail { // create an adjusted mantissa, biased by the invalid power2 // for significant digits already multiplied by 10 ** q. template -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept { int hilz = int(w >> 63) ^ 1; adjusted_mantissa answer; @@ -118,16 +118,11 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept { // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift) value128 product = compute_product_approximation(q, w); - if(product.low == 0xFFFFFFFFFFFFFFFF) { // could guard it further - // In some very rare cases, this could happen, in which case we might need a more accurate - // computation that what we can provide cheaply. This is very, very unlikely. - // - const bool inside_safe_exponent = (q >= -27) && (q <= 55); // always good because 5**q <2**128 when q>=0, - // and otherwise, for q<0, we have 5**-q<2**64 and the 128-bit reciprocal allows for exact computation. - if(!inside_safe_exponent) { - return compute_error_scaled(q, product.high, lz); - } - } + // The computed 'product' is always sufficient. 
+ // Mathematical proof: + // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to appear) + // See script/mushtak_lemire.py + // The "compute_product_approximation" function can be slightly slower than a branchless approach: // value128 product = compute_product(q, w); // but in practice, we can win big with the compute_product_approximation if its additional branch diff --git a/cpp/src/arrow/vendored/fast_float/digit_comparison.h b/cpp/src/arrow/vendored/fast_float/digit_comparison.h index 5cb01a93648fd..b27348a1fcc7e 100644 --- a/cpp/src/arrow/vendored/fast_float/digit_comparison.h +++ b/cpp/src/arrow/vendored/fast_float/digit_comparison.h @@ -24,7 +24,8 @@ constexpr static uint64_t powers_of_ten_uint64[] = { // this algorithm is not even close to optimized, but it has no practical // effect on performance: in order to have a faster algorithm, we'd need // to slow down performance for faster algorithms, and this is still fast. -fastfloat_really_inline int32_t scientific_exponent(parsed_number_string& num) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +int32_t scientific_exponent(parsed_number_string& num) noexcept { uint64_t mantissa = num.mantissa; int32_t exponent = int32_t(num.exponent); while (mantissa >= 10000) { @@ -82,7 +83,8 @@ fastfloat_really_inline adjusted_mantissa to_extended_halfway(T value) noexcept // round an extended-precision float to the nearest machine float. template -fastfloat_really_inline void round(adjusted_mantissa& am, callback cb) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +void round(adjusted_mantissa& am, callback cb) noexcept { int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; if (-am.power2 >= mantissa_shift) { // have a denormal float @@ -111,23 +113,19 @@ fastfloat_really_inline void round(adjusted_mantissa& am, callback cb) noexcept } template -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept { - uint64_t mask; - uint64_t halfway; - if (shift == 64) { - mask = UINT64_MAX; - } else { - mask = (uint64_t(1) << shift) - 1; - } - if (shift == 0) { - halfway = 0; - } else { - halfway = uint64_t(1) << (shift - 1); - } + const uint64_t mask + = (shift == 64) + ? UINT64_MAX + : (uint64_t(1) << shift) - 1; + const uint64_t halfway + = (shift == 0) + ? 
0 + : uint64_t(1) << (shift - 1); uint64_t truncated_bits = am.mantissa & mask; - uint64_t is_above = truncated_bits > halfway; - uint64_t is_halfway = truncated_bits == halfway; + bool is_above = truncated_bits > halfway; + bool is_halfway = truncated_bits == halfway; // shift digits into position if (shift == 64) { @@ -141,7 +139,8 @@ void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); } -fastfloat_really_inline void round_down(adjusted_mantissa& am, int32_t shift) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +void round_down(adjusted_mantissa& am, int32_t shift) noexcept { if (shift == 64) { am.mantissa = 0; } else { @@ -200,7 +199,7 @@ void parse_eight_digits(const char*& p, limb& value, size_t& counter, size_t& count) noexcept { count += 8; } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void parse_one_digit(const char*& p, limb& value, size_t& counter, size_t& count) noexcept { value = value * 10 + limb(*p - '0'); p++; diff --git a/cpp/src/arrow/vendored/fast_float/fast_float.h b/cpp/src/arrow/vendored/fast_float/fast_float.h index b379efdd9e409..7942fe35ad1d3 100644 --- a/cpp/src/arrow/vendored/fast_float/fast_float.h +++ b/cpp/src/arrow/vendored/fast_float/fast_float.h @@ -59,7 +59,7 @@ template<typename T> from_chars_result from_chars_advanced(const char *first, const char *last, T &value, parse_options options) noexcept; -} +} // namespace fast_float } // namespace arrow_vendored #include "parse_number.h" #endif // FASTFLOAT_FAST_FLOAT_H diff --git a/cpp/src/arrow/vendored/fast_float/fast_table.h b/cpp/src/arrow/vendored/fast_float/fast_table.h index 4861cab3a0d84..2c637d85c8fb0 100644 --- a/cpp/src/arrow/vendored/fast_float/fast_table.h +++ b/cpp/src/arrow/vendored/fast_float/fast_table.h @@ -18,11 +18,11 @@ namespace fast_float { */ /** - * The smallest non-zero float (binary64) is 2^−1074. + * The smallest non-zero float (binary64) is 2^-1074. * We take as input numbers of the form w x 10^q where w < 2^64. * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076. * However, we have that - * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^−1074. + * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074. * Thus it is possible for a number of the form w * 10^-342 where * w is a 64-bit value to be a non-zero floating-point number. ********* @@ -37,665 +37,666 @@ constexpr static int smallest_power_of_five = binary_format<double>::smallest_power_of_ten(); constexpr static int largest_power_of_five = binary_format<double>::largest_power_of_ten(); constexpr static int number_of_entries = 2 * (largest_power_of_five - smallest_power_of_five + 1); // Powers of five from 5^-342 all the way to 5^308 rounded toward one.
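// [Editorial illustration, not part of the patch] The table below stores the
// 128-bit truncation of each power of five as two consecutive uint64_t
// entries, most significant word first. A minimal sketch of the lookup,
// assuming the powers_template<> members declared in this header (the helper
// name lookup_power_of_five is ours, added only for illustration):
//
//   #include <cstddef>
//   #include <cstdint>
//
//   struct pow5_bits { uint64_t high; uint64_t low; };
//
//   // q must lie in [smallest_power_of_five, largest_power_of_five].
//   inline pow5_bits lookup_power_of_five(int64_t q) {
//     std::size_t index = 2 * std::size_t(q - powers::smallest_power_of_five);
//     return { powers::power_of_five_128[index],        // top 64 bits of 5^q
//              powers::power_of_five_128[index + 1] };  // next 64 bits
//   }
//
// compute_product_approximation in decimal_to_binary.h multiplies the 64-bit
// decimal significand w by one or both of these words to approximate w * 5^q
// to the precision the rounding step needs.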
-static const uint64_t power_of_five_128[number_of_entries]; +constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a,0x113faa2906a13b3f, + 0x9558b4661b6565f8,0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, + 0xe95a99df8ace6f53,0xf4d82c2c107973dc, + 0x91d8a02bb6c10594,0x79071b9b8a4be869, + 0xb64ec836a47146f9,0x9748e2826cdee284, + 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f,0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723,0xad2c788035e61382, + 0x8b16fb203055ac76,0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78,0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b,0x8672648c40e5ad68, + 0xa9c98d8ccb009506,0x680efdaf511f18c2, + 0xd43bf0effdc0ba48,0x212bd1b2566def2, + 0x84a57695fe98746d,0x14bb630f7604b57, + 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, + 0xcf42894a5dce35ea,0x52064cac828675b9, + 0x818995ce7aa0e1b2,0x7343efebd1940993, + 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6,0xd41a26e077774ef6, + 0xfd00b897478238d0,0x8920b098955522b4, + 0x9e20735e8cb16382,0x55b46e5f5d5535b0, + 0xc5a890362fddbc62,0xeb2189f734aa831d, + 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d,0x47b233c92125366e, + 0xc1069cd4eabe89f8,0x999ec0bb696e840a, + 0xf148440a256e2c76,0xc00670ea43ca250d, + 0x96cd2a865764dbca,0x380406926a5e5728, + 0xbc807527ed3e12bc,0xc605083704f5ecf2, + 0xeba09271e88d976b,0xf7864a44c633682e, + 0x93445b8731587ea3,0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c,0x5960ea05bad82964, + 0xe61acf033d1a45df,0x6fb92487298e33bd, + 0x8fd0c16206306bab,0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696,0x8f48a4899877186c, + 0xe0b62e2929aba83c,0x331acdabfe94de87, + 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, + 0x892731ac9faf056e,0xbe311c083a225cd2, + 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, + 0xd64d3d9db981787d,0x92cbbccdad5b108, + 0x85f0468293f0eb4e,0x25bbf56008c58ea5, + 0xa76c582338ed2621,0xaf2af2b80af6f24e, + 0xd1476e2c07286faa,0x1af5af660db4aee1, + 0x82cca4db847945ca,0x50d98d9fc890ed4d, + 0xa37fce126597973c,0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1,0x77b191618c54e9ac, + 0xc795830d75038c1d,0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, + 0x9becce62836ac577,0x4ee367f9430aec32, + 0xc2e801fb244576d5,0x229c41f793cda73f, + 0xf3a20279ed56d48a,0x6b43527578c1110f, + 0x9845418c345644d6,0x830a13896b78aaa9, + 0xbe5691ef416bd60c,0x23cc986bc656d553, + 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, + 0x91376c36d99995be,0x23100809b9c21fa1, + 0xb58547448ffffb2d,0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9,0x16c90c8f323f516c, + 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, + 0xb1442798f49ffb4a,0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d,0x40405643d711d583, + 0x8a7d3eef7f1cfc52,0x482835ea666b2572, + 0xad1c8eab5ee43b66,0xda3243650005eecf, + 0xd863b256369d4a40,0x90bed43e40076a82, + 0x873e4f75e2224e68,0x5a7744a6e804a291, + 0xa90de3535aaae202,0x711515d0a205cb36, + 0xd3515c2831559a83,0xd5a5b44ca873e03, + 0x8412d9991ed58091,0xe858790afe9486c2, + 0xa5178fff668ae0b6,0x626e974dbe39a872, + 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, + 0xa139029f6a239f72,0x1c1fffc1ebc44e80, + 0xc987434744ac874e,0xa327ffb266b56220, + 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, + 
0xc4ce17b399107c22,0xcb550fb4384d21d3, + 0xf6019da07f549b2b,0x7e2a53a146606a48, + 0x99c102844f94e0fb,0x2eda7444cbfc426d, + 0xc0314325637a1939,0xfa911155fefb5308, + 0xf03d93eebc589f88,0x793555ab7eba27ca, + 0x96267c7535b763b5,0x4bc1558b2f3458de, + 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb,0x465e15a979c1cadc, + 0x92a1958a7675175f,0xbfacd89ec191ec9, + 0xb749faed14125d36,0xcef980ec671f667b, + 0xe51c79a85916f484,0x82b7e12780e7401a, + 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9,0x67a791e093e1d49a, + 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d,0x58fae9f773886e18, + 0xda7f5bf590966848,0xaf39a475506a899e, + 0x888f99797a5e012d,0x6d8406c952429603, + 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26,0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, + 0xd0601d8efc57b08b,0xf13b94daf124da26, + 0x823c12795db6ce57,0x76c53d08d6b70858, + 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02,0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a,0x359ab6419ca1091b, + 0xf867241c8cc6d4c0,0xc30163d203c94b62, + 0x9b407691d7fc44f8,0x79e0de63425dcf1d, + 0xc21094364dfb5636,0x985915fc12f542e4, + 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, + 0xbd8430bd08277231,0x50c6ff782a838353, + 0xece53cec4a314ebd,0xa4f8bf5635246428, + 0x940f4613ae5ed136,0x871b7795e136be99, + 0xb913179899f68584,0x28e2557b59846e3f, + 0xe757dd7ec07426e5,0x331aeada2fe589cf, + 0x9096ea6f3848984f,0x3ff0d2c85def7621, + 0xb4bca50b065abe63,0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, + 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, + 0xb080392cc4349dec,0xbd8d794d96aacfb3, + 0xdca04777f541c567,0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60,0xf41686c49db57244, + 0xac5d37d5b79b6239,0x311c2875c522ced5, + 0xd77485cb25823ac7,0x7d633293366b828b, + 0x86a8d39ef77164bc,0xae5dff9c02033197, + 0xa8530886b54dbdeb,0xd9f57f830283fdfc, + 0xd267caa862a12d66,0xd072df63c324fd7b, + 0x8380dea93da4bc60,0x4247cb9e59f71e6d, + 0xa46116538d0deb78,0x52d9be85f074e608, + 0xcd795be870516656,0x67902e276c921f8b, + 0x806bd9714632dff6,0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c,0x796b805720085f81, + 0x9cc3a6eec6311a63,0xcbe3303674053bb0, + 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b,0xee92fb5515482d44, + 0x991711052d8bf3c5,0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6,0xd262d45a78a0635d, + 0xef340a98172aace4,0x86fb897116c87c34, + 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, + 0xbae0a846d2195712,0x8974836059cca109, + 0xe998d258869facd7,0x2bd1a438703fc94b, + 0x91ff83775423cc06,0x7b6306a34627ddcf, + 0xb67f6455292cbf08,0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, + 0x8e938662882af53e,0x547eb47b7282ee9c, + 0xb23867fb2a35b28d,0xe99e619a4f23aa43, + 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, + 0xae0b158b4738705e,0x9624ab50b148d445, + 0xd98ddaee19068c76,0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b,0x7647c3200069671f, + 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, + 0xa5fb0a17c777cf09,0xf468107100525890, + 0xcf79cc9db955c2cc,0x7182148d4066eeb4, + 0x81ac1fe293d599bf,0xc6f14cd848405530, + 0xa21727db38cb002f,0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, + 0xfd442e4688bd304a,0x908f4a166d1da663, + 
0x9e4a9cec15763e2e,0x9a598e4e043287fe, + 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, + 0xf7549530e188c128,0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, + 0xc13a148e3032d6e7,0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, + 0xebdf661791d60f56,0x111b495b3464ad21, + 0x936b9fcebb25c995,0xcab10dd900beec34, + 0xb84687c269ef3bfb,0x3d5d514f40eea742, + 0xe65829b3046b0afa,0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, + 0xb3f4e093db73a093,0x59ed216765690f56, + 0xe0f218b8d25088b8,0x306869c13ec3532c, + 0x8c974f7383725573,0x1e414218c73a13fb, + 0xafbd2350644eeacf,0xe5d1929ef90898fa, + 0xdbac6c247d62a583,0xdf45f746b74abf39, + 0x894bc396ce5da772,0x6b8bba8c328eb783, + 0xab9eb47c81f5114f,0x66ea92f3f326564, + 0xd686619ba27255a2,0xc80a537b0efefebd, + 0x8613fd0145877585,0xbd06742ce95f5f36, + 0xa798fc4196e952e7,0x2c48113823b73704, + 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, + 0x82ef85133de648c4,0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3,0x318df905079926a8, + 0xffbbcfe994e5c61f,0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d,0x6bea10ca65c084e, + 0xc31bfa0fe5698db8,0x486e494fcff30a62, + 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7,0xf89629465a75e01c, + 0xbe89523386091465,0xf6bbb397f1135823, + 0xee2ba6c0678b597f,0x746aa07ded582e2c, + 0x94db483840b717ef,0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb,0x92f34d62616ce413, + 0xe896a0d7e51e1566,0x77b020baf9c81d17, + 0x915e2486ef32cd60,0xace1474dc1d122e, + 0xb5b5ada8aaff80b8,0xd819992132456ba, + 0xe3231912d5bf60e6,0x10e1fff697ed6c69, + 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d,0x86c16c98d2c953c6, + 0xd89d64d57a607744,0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b,0x11471cd764ad4972, + 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, + 0xd389b47879823479,0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb,0xcedf722a585139ba, + 0xa54394fe1eedb8fe,0xc2974eb4ee658828, + 0xce947a3da6a9273e,0x733d226229feea32, + 0x811ccc668829b887,0x806357d5a3f525f, + 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052,0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67,0xbbac2078d443ace2, + 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, + 0xc5029163f384a931,0xa9e795e65d4df11, + 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e,0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, + 0xf07da27a82c37088,0x5d767327bb4e5a4c, + 0x964e858c91ba2655,0x3a6a07f8d510f86f, + 0xbbe226efb628afea,0x890489f70a55368b, + 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb,0x9ce6ebb40173744, + 0xe55990879ddcaabd,0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6,0x9fa946824a12232d, + 0xb32df8e9f3546564,0x47939822dc96abf9, + 0xdff9772470297ebd,0x59787e2b93bc56f7, + 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, + 0xaefae51477a06b03,0xede622920b6b23f1, + 0xdab99e59958885c4,0xe95fab368e45eced, + 0x88b402f7fd75539b,0x11dbcb0218ebb414, + 0xaae103b5fcd2a881,0xd652bdc29f26a119, + 0xd59944a37c0752a2,0x4be76d3346f0495f, + 0x857fcae62d8493a5,0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2,0x7e2000a41346a7a7, + 0x825ecc24c873782f,0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b,0x728900802f0f32fa, + 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, + 
0xfea126b7d78186bc,0xe2f610c84987bfa8, + 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143,0x91503d1c79720dbb, + 0xf8a95fcf88747d94,0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, + 0xc24452da229b021b,0xfbe85badce996168, + 0xf2d56790ab41c2a2,0xfae27299423fb9c3, + 0x97c560ba6b0919a5,0xdccd879fc967d41a, + 0xbdb6b8e905cb600f,0x5400e987bbc1c920, + 0xed246723473e3813,0x290123e9aab23b68, + 0x9436c0760c86e30b,0xf9a0b6720aaf6521, + 0xb94470938fa89bce,0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2,0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232,0x25c6da63c38de1b0, + 0x8d590723948a535f,0x579c487e5a38ad0e, + 0xb0af48ec79ace837,0x2d835a9df0c6d851, + 0xdcdb1b2798182244,0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5,0xe272467e3d222f3f, + 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea,0x98e947129fc2b4e9, + 0xa87fea27a539e9a5,0x3f2398d747b36224, + 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89,0x1953cf68300424ac, + 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, + 0xcdb02555653131b6,0x3792f412cb06794d, + 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b,0xf245825a5a445275, + 0xfb158592be068d2e,0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d,0x55464dd69685606b, + 0xc428d05aa4751e4c,0xaa97e14c3c26b886, + 0xf53304714d9265df,0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab,0xe546a8038efe4029, + 0xbf8fdb78849a5f96,0xde98520472bdd033, + 0xef73d256a5c0f77c,0x963e66858f6d4440, + 0x95a8637627989aad,0xdde7001379a44aa8, + 0xbb127c53b17ec159,0x5560c018580d5d52, + 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, + 0x9226712162ab070d,0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, + 0xb267ed1940f1c61c,0x55f038b237591ed3, + 0xdf01e85f912e37a3,0x6b6c46dec52f6688, + 0x8b61313bbabce2c6,0x2323ac4b3b3da015, + 0xae397d8aa96c1b77,0xabec975e0a0d081a, + 0xd9c7dced53c72255,0x96e7bd358c904a21, + 0x881cea14545c7575,0x7e50d64177da2e54, + 0xaa242499697392d2,0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, + 0x84ec3c97da624ab4,0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba,0x67de18eda5814af2, + 0x81ceb32c4b43fcf4,0x80eacf948770ced7, + 0xa2425ff75e14fc31,0xa1258379a94d028d, + 0xcad2f7f5359a3b3e,0x96ee45813a04330, + 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, + 0x9e74d1b791e07e48,0x775ea264cf55347e, + 0xc612062576589dda,0x95364afe032a819e, + 0xf79687aed3eec551,0x3a83ddbd83f52205, + 0x9abe14cd44753b52,0xc4926a9672793543, + 0xc16d9a0095928a27,0x75b7053c0f178294, + 0xf1c90080baf72cb1,0x5324c68b12dd6339, + 0x971da05074da7bee,0xd3f6fc16ebca5e04, + 0xbce5086492111aea,0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07,0x3aff322e62439fd0, + 0xb877aa3236a4b449,0x9befeb9fad487c3, + 0xe69594bec44de15b,0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9,0xf9d37014bf60a11, + 0xb424dc35095cd80f,0x538484c19ef38c95, + 0xe12e13424bb40e13,0x2865a5f206b06fba, + 0x8cbccc096f5088cb,0xf93f87b7442e45d4, + 0xafebff0bcb24aafe,0xf78f69a51539d749, + 0xdbe6fecebdedd5be,0xb573440e5a884d1c, + 0x89705f4136b4a597,0x31680a88f8953031, + 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc,0x3d32907604691b4d, + 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, + 0xa7c5ac471b478423,0xfcf80dc33721d54, + 0xd1b71758e219652b,0xd3c36113404ea4a9, + 0x83126e978d4fdf3b,0x645a1cac083126ea, + 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, + 
0xcccccccccccccccc,0xcccccccccccccccd, + 0x8000000000000000,0x0, + 0xa000000000000000,0x0, + 0xc800000000000000,0x0, + 0xfa00000000000000,0x0, + 0x9c40000000000000,0x0, + 0xc350000000000000,0x0, + 0xf424000000000000,0x0, + 0x9896800000000000,0x0, + 0xbebc200000000000,0x0, + 0xee6b280000000000,0x0, + 0x9502f90000000000,0x0, + 0xba43b74000000000,0x0, + 0xe8d4a51000000000,0x0, + 0x9184e72a00000000,0x0, + 0xb5e620f480000000,0x0, + 0xe35fa931a0000000,0x0, + 0x8e1bc9bf04000000,0x0, + 0xb1a2bc2ec5000000,0x0, + 0xde0b6b3a76400000,0x0, + 0x8ac7230489e80000,0x0, + 0xad78ebc5ac620000,0x0, + 0xd8d726b7177a8000,0x0, + 0x878678326eac9000,0x0, + 0xa968163f0a57b400,0x0, + 0xd3c21bcecceda100,0x0, + 0x84595161401484a0,0x0, + 0xa56fa5b99019a5c8,0x0, + 0xcecb8f27f4200f3a,0x0, + 0x813f3978f8940984,0x4000000000000000, + 0xa18f07d736b90be5,0x5000000000000000, + 0xc9f2c9cd04674ede,0xa400000000000000, + 0xfc6f7c4045812296,0x4d00000000000000, + 0x9dc5ada82b70b59d,0xf020000000000000, + 0xc5371912364ce305,0x6c28000000000000, + 0xf684df56c3e01bc6,0xc732000000000000, + 0x9a130b963a6c115c,0x3c7f400000000000, + 0xc097ce7bc90715b3,0x4b9f100000000000, + 0xf0bdc21abb48db20,0x1e86d40000000000, + 0x96769950b50d88f4,0x1314448000000000, + 0xbc143fa4e250eb31,0x17d955a000000000, + 0xeb194f8e1ae525fd,0x5dcfab0800000000, + 0x92efd1b8d0cf37be,0x5aa1cae500000000, + 0xb7abc627050305ad,0xf14a3d9e40000000, + 0xe596b7b0c643c719,0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f,0xe4820023a2000000, + 0xb35dbf821ae4f38b,0xdda2802c8a800000, + 0xe0352f62a19e306e,0xd50b2037ad200000, + 0x8c213d9da502de45,0x4526f422cc340000, + 0xaf298d050e4395d6,0x9670b12b7f410000, + 0xdaf3f04651d47b4c,0x3c0cdd765f114000, + 0x88d8762bf324cd0f,0xa5880a69fb6ac800, + 0xab0e93b6efee0053,0x8eea0d047a457a00, + 0xd5d238a4abe98068,0x72a4904598d6d880, + 0x85a36366eb71f041,0x47a6da2b7f864750, + 0xa70c3c40a64e6c51,0x999090b65f67d924, + 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, + 0x82818f1281ed449f,0xbff8f10e7a8921a4, + 0xa321f2d7226895c7,0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, + 0xfee50b7025c36a08,0x2f236d04753d5b4, + 0x9f4f2726179a2245,0x1d762422c946590, + 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, + 0x9b934c3b330c8577,0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a,0x8bef464e3945ef7a, + 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, + 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436,0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44,0x60dbbca87196b616, + 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, + 0xb51d13aea4a488dd,0x6babab6398bdbe41, + 0xe264589a4dcdab14,0xc696963c7eed2dd1, + 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8,0x3b25a55f43294bcb, + 0xdd15fe86affad912,0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab,0x6e3569326c784337, + 0xacb92ed9397bf996,0x49c2c37f07965404, + 0xd7e77a8f87daf7fb,0xdc33745ec97be906, + 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, + 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b,0xf50a3fa490c30190, + 0x83c7088e1aab65db,0x792667c6da79e0fa, + 0xa4b8cab1a1563f52,0x577001b891185938, + 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, + 0x80b05e5ac60b6178,0x544f8158315b05b4, + 0xa0dc75f1778e39d6,0x696361ae3db1c721, + 0xc913936dd571c84c,0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f,0x4ab48a04065c723, + 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, + 0xc45d1df942711d9a,0x3ba5d0bd324f8394, + 0xf5746577930d6500,0xca8f44ec7ee36479, + 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, + 
0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5,0xbba1f1d158724a12, + 0xbb445da9ca61281f,0x2a8a6e45ae8edc97, + 0xea1575143cf97226,0xf52d09d71a3293bd, + 0x924d692ca61be758,0x593c2626705f9c56, + 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, + 0xe498f455c38b997a,0xb6dfb9c0f956447, + 0x8edf98b59a373fec,0x4724bd4189bd5eac, + 0xb2977ee300c50fe7,0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, + 0x8b865b215899f46c,0xbd79e0d20082ee74, + 0xae67f1e9aec07187,0xecd8590680a3aa11, + 0xda01ee641a708de9,0xe80e6f4820cc9495, + 0x884134fe908658b2,0x3109058d147fdcdd, + 0xaa51823e34a7eede,0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, + 0x850fadc09923329e,0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45,0x84db8346b786151c, + 0xcfe87f7cef46ff16,0xe612641865679a63, + 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749,0xe3be5e330f38f09d, + 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, + 0xc646d63501a1511d,0xb281e1fd541501b8, + 0xf7d88bc24209a565,0x1f225a7ca91a4226, + 0x9ae757596946075f,0x3375788de9b06958, + 0xc1a12d2fc3978937,0x52d6b1641c83ae, + 0xf209787bb47d6b84,0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332,0xf840b7ba963646e0, + 0xbd176620a501fbff,0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf,0xc66f336c36b10137, + 0xb8a8d9bbe123f017,0xb80b0047445d4184, + 0xe6d3102ad96cec1d,0xa60dc059157491e5, + 0x9043ea1ac7e41392,0x87c89837ad68db2f, + 0xb454e4a179dd1877,0x29babe4598c311fb, + 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d,0x1899e4a65f58660c, + 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d,0x76707543f4fa1f73, + 0x899504ae72497eba,0x6a06494a791c53a8, + 0xabfa45da0edbde69,0x487db9d17636892, + 0xd6f8d7509292d603,0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, + 0xa7f26836f282b732,0x8e6cac7768d7141e, + 0xd1ef0244af2364ff,0x3207d795430cd926, + 0x8335616aed761f1f,0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, + 0xcd036837130890a1,0x36dba887c37a8c0f, + 0x802221226be55a64,0xc2494954da2c9789, + 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d,0x6f92829494e5acc7, + 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, + 0x9c69a97284b578d7,0xff2a760414536efb, + 0xc38413cf25e2d70d,0xfef5138519684aba, + 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, + 0x98bf2f79d5993802,0xef2f773ffbd97a61, + 0xbeeefb584aff8603,0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2,0xdd945a747bf26183, + 0xba756174393d88df,0x94f971119aeef9e4, + 0xe912b9d1478ceb17,0x7a37cd5601aab85d, + 0x91abb422ccb812ee,0xac62e055c10ab33a, + 0xb616a12b7fe617aa,0x577b986b314d6009, + 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d,0x14588f13be847307, + 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee,0x25de7bb9480d5854, + 0xada72ccc20054ae9,0xaf561aa79a10ae6a, + 0xd910f7ff28069da4,0x1b2ba1518094da04, + 0x87aa9aff79042286,0x90fb44d2f05d0842, + 0xa99541bf57452b28,0x353a1607ac744a53, + 0xd3fa922f2d1675f2,0x42889b8997915ce8, + 0x847c9b5d7c2e09b7,0x69956135febada11, + 0xa59bc234db398c25,0x43fab9837e699095, + 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, + 0x8161afb94b44f57d,0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc,0x6462d92a69731732, + 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78,0x5cda735244c3d43e, + 0x9defbf01b061adab,0x3a0888136afa64a7, + 0xc56baec21c7a1916,0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b,0x8aad549e57273d45, + 0x9a3c2087a63f6399,0x36ac54e2f678864b, + 
0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, + 0x969eb7c47859e743,0x9f644ae5a4b1b325, + 0xbc4665b596706114,0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8,0x9a7f12442d588f2, + 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, + 0x8fa475791a569d10,0xf96e017d694487bc, + 0xb38d92d760ec4455,0x37c981dcc395a9ac, + 0xe070f78d3927556a,0x85bbe253f47b1417, + 0x8c469ab843b89562,0x93956d7478ccec8e, + 0xaf58416654a6babb,0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, + 0x88fcf317f22241e2,0x441fece3bdf81f03, + 0xab3c2fddeeaad25a,0xd527e81cad7626c3, + 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, + 0x85c7056562757456,0xf6872d5667844e49, + 0xa738c6bebb12d16c,0xb428f8ac016561db, + 0xd106f86e69d785c7,0xe13336d701beba52, + 0x82a45b450226b39c,0xecc0024661173473, + 0xa34d721642b06084,0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, + 0xff290242c83396ce,0x7e67047175a15271, + 0x9f79a169bd203e41,0xf0062c6e984d386, + 0xc75809c42c684dd1,0x52c07b78a3e60868, + 0xf92e0c3537826145,0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb,0x88a66076400bb691, + 0xc2abf989935ddbfe,0x6acff893d00ea435, + 0xf356f7ebf83552fe,0x583f6b8c4124d43, + 0x98165af37b2153de,0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, + 0xeda2ee1c7064130c,0x1162def06f79df73, + 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, + 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0,0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, + 0x8da471a9de737e24,0x5ceaecfed289e5d2, + 0xb10d8e1456105dad,0x7425a83e872c5f47, + 0xdd50f1996b947518,0xd12f124e28f77719, + 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b,0x636cc64d1001550b, + 0xd8210befd30efa5a,0x3c47f7e05401aa4e, + 0x8714a775e3e95c78,0x65acfaec34810a71, + 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, + 0xd31045a8341ca07c,0x1ede48111209a050, + 0x83ea2b892091e44d,0x934aed0aab460432, + 0xa4e4b66b68b65d60,0xf81da84d5617853f, + 0xce1de40642e3f4b9,0x36251260ab9d668e, + 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, + 0xa1075a24e4421730,0xb24cf65b8612f81f, + 0xc94930ae1d529cfc,0xdee033f26797b627, + 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, + 0x9d412e0806e88aa5,0x8e1f289560ee864e, + 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2,0xae10af696774b1db, + 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f,0x17fd090a58d32af3, + 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, + 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513,0x84c86189216dc5ed, + 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515,0xfabaf3feaa5334a, + 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8,0x743e20e9ef511012, + 0xdf78e4b2bd342cf6,0x914da9246b255416, + 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, + 0xae9672aba3d0c320,0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, + 0x8865899617fb1871,0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, + 0xd51ea6fa85785631,0x552a74227f3ea565, + 0x8533285c936b35de,0xd53a88958f87275f, + 0xa67ff273b8460356,0x8a892abaf368f137, + 0xd01fef10a657842c,0x2d2b7569b0432d85, + 0x8213f56a67f6b29b,0x9c3b29620e29fc73, + 0xa298f2c501f45f42,0x8349f3ba91b47b8f, + 0xcb3f2f7642717713,0x241c70a936219a73, + 0xfe0efb53d30dd4d7,0xed238cd383aa0110, + 0x9ec95d1463e8a506,0xf4363804324a40aa, + 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da,0xdd94b7868e94050a, + 
0x9b10a4e5e9913128,0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf,0xbc633b39673c8cec, + 0x976e41088617ca01,0xd5be0503e085d813, + 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, + 0xec9c459d51852ba2,0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45,0xcabb90e5c942b503, + 0xb8da1662e7b00a17,0x3d6a751f3b936243, + 0xe7109bfba19c0c9d,0xcc512670a783ad4, + 0x906a617d450187e2,0x27fb2b80668b24c5, + 0xb484f9dc9641e9da,0xb1f9f660802dedf6, + 0xe1a63853bbd26451,0x5e7873f8a0396973, + 0x8d07e33455637eb2,0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7,0x7641a140cc7810fb, + 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, + 0xac2820d9623bf429,0x546345fa9fbdcd44, + 0xd732290fbacaf133,0xa97c177947ad4095, + 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, + 0xa81f301449ee8c70,0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c,0x73832eec6fff3111, + 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, + 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, + 0xa0555e361951c366,0xd7e105bcc332621f, + 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, + 0xfa856334878fc150,0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07,0xa862f80ec4700c8, + 0xf4a642e14c6262c8,0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, + 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, + 0xeeea5d5004981478,0x1858ccfce06cac74, + 0x95527a5202df0ccb,0xf37801e0c43ebc8, + 0xbaa718e68396cffd,0xd30560258f54e6ba, + 0xe950df20247c83fd,0x47c6b82ef32a2069, + 0x91d28b7416cdd27e,0x4cdc331d57fa5441, + 0xb6472e511c81471d,0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5,0x58180fddd97723a6, + 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; }; template -const uint64_t powers_template::power_of_five_128[number_of_entries] = { - 0xeef453d6923bd65a,0x113faa2906a13b3f, - 0x9558b4661b6565f8,0x4ac7ca59a424c507, - 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, - 0xe95a99df8ace6f53,0xf4d82c2c107973dc, - 0x91d8a02bb6c10594,0x79071b9b8a4be869, - 0xb64ec836a47146f9,0x9748e2826cdee284, - 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, - 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, - 0xb208ef855c969f4f,0xbdbd2d335e51a935, - 0xde8b2b66b3bc4723,0xad2c788035e61382, - 0x8b16fb203055ac76,0x4c3bcb5021afcc31, - 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, - 0xd953e8624b85dd78,0xd71d6dad34a2af0d, - 0x87d4713d6f33aa6b,0x8672648c40e5ad68, - 0xa9c98d8ccb009506,0x680efdaf511f18c2, - 0xd43bf0effdc0ba48,0x212bd1b2566def2, - 0x84a57695fe98746d,0x14bb630f7604b57, - 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, - 0xcf42894a5dce35ea,0x52064cac828675b9, - 0x818995ce7aa0e1b2,0x7343efebd1940993, - 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, - 0xca66fa129f9b60a6,0xd41a26e077774ef6, - 0xfd00b897478238d0,0x8920b098955522b4, - 0x9e20735e8cb16382,0x55b46e5f5d5535b0, - 0xc5a890362fddbc62,0xeb2189f734aa831d, - 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, - 0x9a6bb0aa55653b2d,0x47b233c92125366e, - 0xc1069cd4eabe89f8,0x999ec0bb696e840a, - 0xf148440a256e2c76,0xc00670ea43ca250d, - 0x96cd2a865764dbca,0x380406926a5e5728, - 0xbc807527ed3e12bc,0xc605083704f5ecf2, - 0xeba09271e88d976b,0xf7864a44c633682e, - 0x93445b8731587ea3,0x7ab3ee6afbe0211d, - 0xb8157268fdae9e4c,0x5960ea05bad82964, - 0xe61acf033d1a45df,0x6fb92487298e33bd, - 0x8fd0c16206306bab,0xa5d3b6d479f8e056, - 0xb3c4f1ba87bc8696,0x8f48a4899877186c, - 0xe0b62e2929aba83c,0x331acdabfe94de87, - 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, - 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, - 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, - 0x892731ac9faf056e,0xbe311c083a225cd2, - 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, - 
0xd64d3d9db981787d,0x92cbbccdad5b108, - 0x85f0468293f0eb4e,0x25bbf56008c58ea5, - 0xa76c582338ed2621,0xaf2af2b80af6f24e, - 0xd1476e2c07286faa,0x1af5af660db4aee1, - 0x82cca4db847945ca,0x50d98d9fc890ed4d, - 0xa37fce126597973c,0xe50ff107bab528a0, - 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, - 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, - 0x9faacf3df73609b1,0x77b191618c54e9ac, - 0xc795830d75038c1d,0xd59df5b9ef6a2417, - 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, - 0x9becce62836ac577,0x4ee367f9430aec32, - 0xc2e801fb244576d5,0x229c41f793cda73f, - 0xf3a20279ed56d48a,0x6b43527578c1110f, - 0x9845418c345644d6,0x830a13896b78aaa9, - 0xbe5691ef416bd60c,0x23cc986bc656d553, - 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, - 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, - 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, - 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, - 0x91376c36d99995be,0x23100809b9c21fa1, - 0xb58547448ffffb2d,0xabd40a0c2832a78a, - 0xe2e69915b3fff9f9,0x16c90c8f323f516c, - 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, - 0xb1442798f49ffb4a,0x99cd11cfdf41779c, - 0xdd95317f31c7fa1d,0x40405643d711d583, - 0x8a7d3eef7f1cfc52,0x482835ea666b2572, - 0xad1c8eab5ee43b66,0xda3243650005eecf, - 0xd863b256369d4a40,0x90bed43e40076a82, - 0x873e4f75e2224e68,0x5a7744a6e804a291, - 0xa90de3535aaae202,0x711515d0a205cb36, - 0xd3515c2831559a83,0xd5a5b44ca873e03, - 0x8412d9991ed58091,0xe858790afe9486c2, - 0xa5178fff668ae0b6,0x626e974dbe39a872, - 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, - 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, - 0xa139029f6a239f72,0x1c1fffc1ebc44e80, - 0xc987434744ac874e,0xa327ffb266b56220, - 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, - 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, - 0xc4ce17b399107c22,0xcb550fb4384d21d3, - 0xf6019da07f549b2b,0x7e2a53a146606a48, - 0x99c102844f94e0fb,0x2eda7444cbfc426d, - 0xc0314325637a1939,0xfa911155fefb5308, - 0xf03d93eebc589f88,0x793555ab7eba27ca, - 0x96267c7535b763b5,0x4bc1558b2f3458de, - 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, - 0xea9c227723ee8bcb,0x465e15a979c1cadc, - 0x92a1958a7675175f,0xbfacd89ec191ec9, - 0xb749faed14125d36,0xcef980ec671f667b, - 0xe51c79a85916f484,0x82b7e12780e7401a, - 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, - 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, - 0xdfbdcece67006ac9,0x67a791e093e1d49a, - 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, - 0xaecc49914078536d,0x58fae9f773886e18, - 0xda7f5bf590966848,0xaf39a475506a899e, - 0x888f99797a5e012d,0x6d8406c952429603, - 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, - 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, - 0x855c3be0a17fcd26,0x5cf2eea09a55067f, - 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, - 0xd0601d8efc57b08b,0xf13b94daf124da26, - 0x823c12795db6ce57,0x76c53d08d6b70858, - 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, - 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, - 0xfe5d54150b090b02,0xd3f93b35435d7c4c, - 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, - 0xc6b8e9b0709f109a,0x359ab6419ca1091b, - 0xf867241c8cc6d4c0,0xc30163d203c94b62, - 0x9b407691d7fc44f8,0x79e0de63425dcf1d, - 0xc21094364dfb5636,0x985915fc12f542e4, - 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, - 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, - 0xbd8430bd08277231,0x50c6ff782a838353, - 0xece53cec4a314ebd,0xa4f8bf5635246428, - 0x940f4613ae5ed136,0x871b7795e136be99, - 0xb913179899f68584,0x28e2557b59846e3f, - 0xe757dd7ec07426e5,0x331aeada2fe589cf, - 0x9096ea6f3848984f,0x3ff0d2c85def7621, - 0xb4bca50b065abe63,0xfed077a756b53a9, - 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, - 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, - 0xb080392cc4349dec,0xbd8d794d96aacfb3, - 0xdca04777f541c567,0xecf0d7a0fc5583a0, - 0x89e42caaf9491b60,0xf41686c49db57244, - 
0xac5d37d5b79b6239,0x311c2875c522ced5, - 0xd77485cb25823ac7,0x7d633293366b828b, - 0x86a8d39ef77164bc,0xae5dff9c02033197, - 0xa8530886b54dbdeb,0xd9f57f830283fdfc, - 0xd267caa862a12d66,0xd072df63c324fd7b, - 0x8380dea93da4bc60,0x4247cb9e59f71e6d, - 0xa46116538d0deb78,0x52d9be85f074e608, - 0xcd795be870516656,0x67902e276c921f8b, - 0x806bd9714632dff6,0xba1cd8a3db53b6, - 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, - 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, - 0xfad2a4b13d1b5d6c,0x796b805720085f81, - 0x9cc3a6eec6311a63,0xcbe3303674053bb0, - 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, - 0xf4f1b4d515acb93b,0xee92fb5515482d44, - 0x991711052d8bf3c5,0x751bdd152d4d1c4a, - 0xbf5cd54678eef0b6,0xd262d45a78a0635d, - 0xef340a98172aace4,0x86fb897116c87c34, - 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, - 0xbae0a846d2195712,0x8974836059cca109, - 0xe998d258869facd7,0x2bd1a438703fc94b, - 0x91ff83775423cc06,0x7b6306a34627ddcf, - 0xb67f6455292cbf08,0x1a3bc84c17b1d542, - 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, - 0x8e938662882af53e,0x547eb47b7282ee9c, - 0xb23867fb2a35b28d,0xe99e619a4f23aa43, - 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, - 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, - 0xae0b158b4738705e,0x9624ab50b148d445, - 0xd98ddaee19068c76,0x3badd624dd9b0957, - 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, - 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, - 0xd47487cc8470652b,0x7647c3200069671f, - 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, - 0xa5fb0a17c777cf09,0xf468107100525890, - 0xcf79cc9db955c2cc,0x7182148d4066eeb4, - 0x81ac1fe293d599bf,0xc6f14cd848405530, - 0xa21727db38cb002f,0xb8ada00e5a506a7c, - 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, - 0xfd442e4688bd304a,0x908f4a166d1da663, - 0x9e4a9cec15763e2e,0x9a598e4e043287fe, - 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, - 0xf7549530e188c128,0xd12bee59e68ef47c, - 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, - 0xc13a148e3032d6e7,0xe36a52363c1faf01, - 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, - 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, - 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, - 0xebdf661791d60f56,0x111b495b3464ad21, - 0x936b9fcebb25c995,0xcab10dd900beec34, - 0xb84687c269ef3bfb,0x3d5d514f40eea742, - 0xe65829b3046b0afa,0xcb4a5a3112a5112, - 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, - 0xb3f4e093db73a093,0x59ed216765690f56, - 0xe0f218b8d25088b8,0x306869c13ec3532c, - 0x8c974f7383725573,0x1e414218c73a13fb, - 0xafbd2350644eeacf,0xe5d1929ef90898fa, - 0xdbac6c247d62a583,0xdf45f746b74abf39, - 0x894bc396ce5da772,0x6b8bba8c328eb783, - 0xab9eb47c81f5114f,0x66ea92f3f326564, - 0xd686619ba27255a2,0xc80a537b0efefebd, - 0x8613fd0145877585,0xbd06742ce95f5f36, - 0xa798fc4196e952e7,0x2c48113823b73704, - 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, - 0x82ef85133de648c4,0x9a984d73dbe722fb, - 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, - 0xcc963fee10b7d1b3,0x318df905079926a8, - 0xffbbcfe994e5c61f,0xfdf17746497f7052, - 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, - 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, - 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, - 0x9c1661a651213e2d,0x6bea10ca65c084e, - 0xc31bfa0fe5698db8,0x486e494fcff30a62, - 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, - 0x986ddb5c6b3a76b7,0xf89629465a75e01c, - 0xbe89523386091465,0xf6bbb397f1135823, - 0xee2ba6c0678b597f,0x746aa07ded582e2c, - 0x94db483840b717ef,0xa8c2a44eb4571cdc, - 0xba121a4650e4ddeb,0x92f34d62616ce413, - 0xe896a0d7e51e1566,0x77b020baf9c81d17, - 0x915e2486ef32cd60,0xace1474dc1d122e, - 0xb5b5ada8aaff80b8,0xd819992132456ba, - 0xe3231912d5bf60e6,0x10e1fff697ed6c69, - 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, - 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, - 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, - 
0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, - 0xad4ab7112eb3929d,0x86c16c98d2c953c6, - 0xd89d64d57a607744,0xe871c7bf077ba8b7, - 0x87625f056c7c4a8b,0x11471cd764ad4972, - 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, - 0xd389b47879823479,0x4aff1d108d4ec2c3, - 0x843610cb4bf160cb,0xcedf722a585139ba, - 0xa54394fe1eedb8fe,0xc2974eb4ee658828, - 0xce947a3da6a9273e,0x733d226229feea32, - 0x811ccc668829b887,0x806357d5a3f525f, - 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, - 0xc9bcff6034c13052,0xfc89b393dd02f0b5, - 0xfc2c3f3841f17c67,0xbbac2078d443ace2, - 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, - 0xc5029163f384a931,0xa9e795e65d4df11, - 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, - 0x99ea0196163fa42e,0x504bced1bf8e4e45, - 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, - 0xf07da27a82c37088,0x5d767327bb4e5a4c, - 0x964e858c91ba2655,0x3a6a07f8d510f86f, - 0xbbe226efb628afea,0x890489f70a55368b, - 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, - 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, - 0xb77ada0617e3bbcb,0x9ce6ebb40173744, - 0xe55990879ddcaabd,0xcc420a6a101d0515, - 0x8f57fa54c2a9eab6,0x9fa946824a12232d, - 0xb32df8e9f3546564,0x47939822dc96abf9, - 0xdff9772470297ebd,0x59787e2b93bc56f7, - 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, - 0xaefae51477a06b03,0xede622920b6b23f1, - 0xdab99e59958885c4,0xe95fab368e45eced, - 0x88b402f7fd75539b,0x11dbcb0218ebb414, - 0xaae103b5fcd2a881,0xd652bdc29f26a119, - 0xd59944a37c0752a2,0x4be76d3346f0495f, - 0x857fcae62d8493a5,0x6f70a4400c562ddb, - 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, - 0xd097ad07a71f26b2,0x7e2000a41346a7a7, - 0x825ecc24c873782f,0x8ed400668c0c28c8, - 0xa2f67f2dfa90563b,0x728900802f0f32fa, - 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, - 0xfea126b7d78186bc,0xe2f610c84987bfa8, - 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, - 0xc6ede63fa05d3143,0x91503d1c79720dbb, - 0xf8a95fcf88747d94,0x75a44c6397ce912a, - 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, - 0xc24452da229b021b,0xfbe85badce996168, - 0xf2d56790ab41c2a2,0xfae27299423fb9c3, - 0x97c560ba6b0919a5,0xdccd879fc967d41a, - 0xbdb6b8e905cb600f,0x5400e987bbc1c920, - 0xed246723473e3813,0x290123e9aab23b68, - 0x9436c0760c86e30b,0xf9a0b6720aaf6521, - 0xb94470938fa89bce,0xf808e40e8d5b3e69, - 0xe7958cb87392c2c2,0xb60b1d1230b20e04, - 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, - 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, - 0xe2280b6c20dd5232,0x25c6da63c38de1b0, - 0x8d590723948a535f,0x579c487e5a38ad0e, - 0xb0af48ec79ace837,0x2d835a9df0c6d851, - 0xdcdb1b2798182244,0xf8e431456cf88e65, - 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, - 0xac8b2d36eed2dac5,0xe272467e3d222f3f, - 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, - 0x86ccbb52ea94baea,0x98e947129fc2b4e9, - 0xa87fea27a539e9a5,0x3f2398d747b36224, - 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, - 0x83a3eeeef9153e89,0x1953cf68300424ac, - 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, - 0xcdb02555653131b6,0x3792f412cb06794d, - 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, - 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, - 0xc8de047564d20a8b,0xf245825a5a445275, - 0xfb158592be068d2e,0xeed6e2f0f0d56712, - 0x9ced737bb6c4183d,0x55464dd69685606b, - 0xc428d05aa4751e4c,0xaa97e14c3c26b886, - 0xf53304714d9265df,0xd53dd99f4b3066a8, - 0x993fe2c6d07b7fab,0xe546a8038efe4029, - 0xbf8fdb78849a5f96,0xde98520472bdd033, - 0xef73d256a5c0f77c,0x963e66858f6d4440, - 0x95a8637627989aad,0xdde7001379a44aa8, - 0xbb127c53b17ec159,0x5560c018580d5d52, - 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, - 0x9226712162ab070d,0xcab3961304ca70e8, - 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, - 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, - 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, - 0xb267ed1940f1c61c,0x55f038b237591ed3, - 
0xdf01e85f912e37a3,0x6b6c46dec52f6688, - 0x8b61313bbabce2c6,0x2323ac4b3b3da015, - 0xae397d8aa96c1b77,0xabec975e0a0d081a, - 0xd9c7dced53c72255,0x96e7bd358c904a21, - 0x881cea14545c7575,0x7e50d64177da2e54, - 0xaa242499697392d2,0xdde50bd1d5d0b9e9, - 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, - 0x84ec3c97da624ab4,0xbd5af13bef0b113e, - 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, - 0xcfb11ead453994ba,0x67de18eda5814af2, - 0x81ceb32c4b43fcf4,0x80eacf948770ced7, - 0xa2425ff75e14fc31,0xa1258379a94d028d, - 0xcad2f7f5359a3b3e,0x96ee45813a04330, - 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, - 0x9e74d1b791e07e48,0x775ea264cf55347e, - 0xc612062576589dda,0x95364afe032a819e, - 0xf79687aed3eec551,0x3a83ddbd83f52205, - 0x9abe14cd44753b52,0xc4926a9672793543, - 0xc16d9a0095928a27,0x75b7053c0f178294, - 0xf1c90080baf72cb1,0x5324c68b12dd6339, - 0x971da05074da7bee,0xd3f6fc16ebca5e04, - 0xbce5086492111aea,0x88f4bb1ca6bcf585, - 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, - 0x9392ee8e921d5d07,0x3aff322e62439fd0, - 0xb877aa3236a4b449,0x9befeb9fad487c3, - 0xe69594bec44de15b,0x4c2ebe687989a9b4, - 0x901d7cf73ab0acd9,0xf9d37014bf60a11, - 0xb424dc35095cd80f,0x538484c19ef38c95, - 0xe12e13424bb40e13,0x2865a5f206b06fba, - 0x8cbccc096f5088cb,0xf93f87b7442e45d4, - 0xafebff0bcb24aafe,0xf78f69a51539d749, - 0xdbe6fecebdedd5be,0xb573440e5a884d1c, - 0x89705f4136b4a597,0x31680a88f8953031, - 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, - 0xd6bf94d5e57a42bc,0x3d32907604691b4d, - 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, - 0xa7c5ac471b478423,0xfcf80dc33721d54, - 0xd1b71758e219652b,0xd3c36113404ea4a9, - 0x83126e978d4fdf3b,0x645a1cac083126ea, - 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, - 0xcccccccccccccccc,0xcccccccccccccccd, - 0x8000000000000000,0x0, - 0xa000000000000000,0x0, - 0xc800000000000000,0x0, - 0xfa00000000000000,0x0, - 0x9c40000000000000,0x0, - 0xc350000000000000,0x0, - 0xf424000000000000,0x0, - 0x9896800000000000,0x0, - 0xbebc200000000000,0x0, - 0xee6b280000000000,0x0, - 0x9502f90000000000,0x0, - 0xba43b74000000000,0x0, - 0xe8d4a51000000000,0x0, - 0x9184e72a00000000,0x0, - 0xb5e620f480000000,0x0, - 0xe35fa931a0000000,0x0, - 0x8e1bc9bf04000000,0x0, - 0xb1a2bc2ec5000000,0x0, - 0xde0b6b3a76400000,0x0, - 0x8ac7230489e80000,0x0, - 0xad78ebc5ac620000,0x0, - 0xd8d726b7177a8000,0x0, - 0x878678326eac9000,0x0, - 0xa968163f0a57b400,0x0, - 0xd3c21bcecceda100,0x0, - 0x84595161401484a0,0x0, - 0xa56fa5b99019a5c8,0x0, - 0xcecb8f27f4200f3a,0x0, - 0x813f3978f8940984,0x4000000000000000, - 0xa18f07d736b90be5,0x5000000000000000, - 0xc9f2c9cd04674ede,0xa400000000000000, - 0xfc6f7c4045812296,0x4d00000000000000, - 0x9dc5ada82b70b59d,0xf020000000000000, - 0xc5371912364ce305,0x6c28000000000000, - 0xf684df56c3e01bc6,0xc732000000000000, - 0x9a130b963a6c115c,0x3c7f400000000000, - 0xc097ce7bc90715b3,0x4b9f100000000000, - 0xf0bdc21abb48db20,0x1e86d40000000000, - 0x96769950b50d88f4,0x1314448000000000, - 0xbc143fa4e250eb31,0x17d955a000000000, - 0xeb194f8e1ae525fd,0x5dcfab0800000000, - 0x92efd1b8d0cf37be,0x5aa1cae500000000, - 0xb7abc627050305ad,0xf14a3d9e40000000, - 0xe596b7b0c643c719,0x6d9ccd05d0000000, - 0x8f7e32ce7bea5c6f,0xe4820023a2000000, - 0xb35dbf821ae4f38b,0xdda2802c8a800000, - 0xe0352f62a19e306e,0xd50b2037ad200000, - 0x8c213d9da502de45,0x4526f422cc340000, - 0xaf298d050e4395d6,0x9670b12b7f410000, - 0xdaf3f04651d47b4c,0x3c0cdd765f114000, - 0x88d8762bf324cd0f,0xa5880a69fb6ac800, - 0xab0e93b6efee0053,0x8eea0d047a457a00, - 0xd5d238a4abe98068,0x72a4904598d6d880, - 0x85a36366eb71f041,0x47a6da2b7f864750, - 0xa70c3c40a64e6c51,0x999090b65f67d924, - 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, - 
0x82818f1281ed449f,0xbff8f10e7a8921a4, - 0xa321f2d7226895c7,0xaff72d52192b6a0d, - 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, - 0xfee50b7025c36a08,0x2f236d04753d5b4, - 0x9f4f2726179a2245,0x1d762422c946590, - 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, - 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, - 0x9b934c3b330c8577,0x63cc55f49f88eb2f, - 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, - 0xf316271c7fc3908a,0x8bef464e3945ef7a, - 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, - 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, - 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, - 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, - 0xb975d6b6ee39e436,0xb3e2fd538e122b44, - 0xe7d34c64a9c85d44,0x60dbbca87196b616, - 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, - 0xb51d13aea4a488dd,0x6babab6398bdbe41, - 0xe264589a4dcdab14,0xc696963c7eed2dd1, - 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, - 0xb0de65388cc8ada8,0x3b25a55f43294bcb, - 0xdd15fe86affad912,0x49ef0eb713f39ebe, - 0x8a2dbf142dfcc7ab,0x6e3569326c784337, - 0xacb92ed9397bf996,0x49c2c37f07965404, - 0xd7e77a8f87daf7fb,0xdc33745ec97be906, - 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, - 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, - 0xd2d80db02aabd62b,0xf50a3fa490c30190, - 0x83c7088e1aab65db,0x792667c6da79e0fa, - 0xa4b8cab1a1563f52,0x577001b891185938, - 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, - 0x80b05e5ac60b6178,0x544f8158315b05b4, - 0xa0dc75f1778e39d6,0x696361ae3db1c721, - 0xc913936dd571c84c,0x3bc3a19cd1e38e9, - 0xfb5878494ace3a5f,0x4ab48a04065c723, - 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, - 0xc45d1df942711d9a,0x3ba5d0bd324f8394, - 0xf5746577930d6500,0xca8f44ec7ee36479, - 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, - 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, - 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, - 0x95d04aee3b80ece5,0xbba1f1d158724a12, - 0xbb445da9ca61281f,0x2a8a6e45ae8edc97, - 0xea1575143cf97226,0xf52d09d71a3293bd, - 0x924d692ca61be758,0x593c2626705f9c56, - 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, - 0xe498f455c38b997a,0xb6dfb9c0f956447, - 0x8edf98b59a373fec,0x4724bd4189bd5eac, - 0xb2977ee300c50fe7,0x58edec91ec2cb657, - 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, - 0x8b865b215899f46c,0xbd79e0d20082ee74, - 0xae67f1e9aec07187,0xecd8590680a3aa11, - 0xda01ee641a708de9,0xe80e6f4820cc9495, - 0x884134fe908658b2,0x3109058d147fdcdd, - 0xaa51823e34a7eede,0xbd4b46f0599fd415, - 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, - 0x850fadc09923329e,0x3e2cf6bc604ddb0, - 0xa6539930bf6bff45,0x84db8346b786151c, - 0xcfe87f7cef46ff16,0xe612641865679a63, - 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, - 0xa26da3999aef7749,0xe3be5e330f38f09d, - 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, - 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, - 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, - 0xc646d63501a1511d,0xb281e1fd541501b8, - 0xf7d88bc24209a565,0x1f225a7ca91a4226, - 0x9ae757596946075f,0x3375788de9b06958, - 0xc1a12d2fc3978937,0x52d6b1641c83ae, - 0xf209787bb47d6b84,0xc0678c5dbd23a49a, - 0x9745eb4d50ce6332,0xf840b7ba963646e0, - 0xbd176620a501fbff,0xb650e5a93bc3d898, - 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, - 0x93ba47c980e98cdf,0xc66f336c36b10137, - 0xb8a8d9bbe123f017,0xb80b0047445d4184, - 0xe6d3102ad96cec1d,0xa60dc059157491e5, - 0x9043ea1ac7e41392,0x87c89837ad68db2f, - 0xb454e4a179dd1877,0x29babe4598c311fb, - 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, - 0x8ce2529e2734bb1d,0x1899e4a65f58660c, - 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, - 0xdc21a1171d42645d,0x76707543f4fa1f73, - 0x899504ae72497eba,0x6a06494a791c53a8, - 0xabfa45da0edbde69,0x487db9d17636892, - 0xd6f8d7509292d603,0x45a9d2845d3c42b6, - 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, - 0xa7f26836f282b732,0x8e6cac7768d7141e, - 0xd1ef0244af2364ff,0x3207d795430cd926, 
- 0x8335616aed761f1f,0x7f44e6bd49e807b8, - 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, - 0xcd036837130890a1,0x36dba887c37a8c0f, - 0x802221226be55a64,0xc2494954da2c9789, - 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, - 0xc83553c5c8965d3d,0x6f92829494e5acc7, - 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, - 0x9c69a97284b578d7,0xff2a760414536efb, - 0xc38413cf25e2d70d,0xfef5138519684aba, - 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, - 0x98bf2f79d5993802,0xef2f773ffbd97a61, - 0xbeeefb584aff8603,0xaafb550ffacfd8fa, - 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, - 0x952ab45cfa97a0b2,0xdd945a747bf26183, - 0xba756174393d88df,0x94f971119aeef9e4, - 0xe912b9d1478ceb17,0x7a37cd5601aab85d, - 0x91abb422ccb812ee,0xac62e055c10ab33a, - 0xb616a12b7fe617aa,0x577b986b314d6009, - 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, - 0x8e41ade9fbebc27d,0x14588f13be847307, - 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, - 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, - 0x8aec23d680043bee,0x25de7bb9480d5854, - 0xada72ccc20054ae9,0xaf561aa79a10ae6a, - 0xd910f7ff28069da4,0x1b2ba1518094da04, - 0x87aa9aff79042286,0x90fb44d2f05d0842, - 0xa99541bf57452b28,0x353a1607ac744a53, - 0xd3fa922f2d1675f2,0x42889b8997915ce8, - 0x847c9b5d7c2e09b7,0x69956135febada11, - 0xa59bc234db398c25,0x43fab9837e699095, - 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, - 0x8161afb94b44f57d,0x1d1be0eebac278f5, - 0xa1ba1ba79e1632dc,0x6462d92a69731732, - 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, - 0xfcb2cb35e702af78,0x5cda735244c3d43e, - 0x9defbf01b061adab,0x3a0888136afa64a7, - 0xc56baec21c7a1916,0x88aaa1845b8fdd0, - 0xf6c69a72a3989f5b,0x8aad549e57273d45, - 0x9a3c2087a63f6399,0x36ac54e2f678864b, - 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, - 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, - 0x969eb7c47859e743,0x9f644ae5a4b1b325, - 0xbc4665b596706114,0x873d5d9f0dde1fee, - 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, - 0x9316ff75dd87cbd8,0x9a7f12442d588f2, - 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, - 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, - 0x8fa475791a569d10,0xf96e017d694487bc, - 0xb38d92d760ec4455,0x37c981dcc395a9ac, - 0xe070f78d3927556a,0x85bbe253f47b1417, - 0x8c469ab843b89562,0x93956d7478ccec8e, - 0xaf58416654a6babb,0x387ac8d1970027b2, - 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, - 0x88fcf317f22241e2,0x441fece3bdf81f03, - 0xab3c2fddeeaad25a,0xd527e81cad7626c3, - 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, - 0x85c7056562757456,0xf6872d5667844e49, - 0xa738c6bebb12d16c,0xb428f8ac016561db, - 0xd106f86e69d785c7,0xe13336d701beba52, - 0x82a45b450226b39c,0xecc0024661173473, - 0xa34d721642b06084,0x27f002d7f95d0190, - 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, - 0xff290242c83396ce,0x7e67047175a15271, - 0x9f79a169bd203e41,0xf0062c6e984d386, - 0xc75809c42c684dd1,0x52c07b78a3e60868, - 0xf92e0c3537826145,0xa7709a56ccdf8a82, - 0x9bbcc7a142b17ccb,0x88a66076400bb691, - 0xc2abf989935ddbfe,0x6acff893d00ea435, - 0xf356f7ebf83552fe,0x583f6b8c4124d43, - 0x98165af37b2153de,0xc3727a337a8b704a, - 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, - 0xeda2ee1c7064130c,0x1162def06f79df73, - 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, - 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, - 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, - 0x910ab1d4db9914a0,0x1d9c9892400a22a2, - 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, - 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, - 0x8da471a9de737e24,0x5ceaecfed289e5d2, - 0xb10d8e1456105dad,0x7425a83e872c5f47, - 0xdd50f1996b947518,0xd12f124e28f77719, - 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, - 0xace73cbfdc0bfb7b,0x636cc64d1001550b, - 0xd8210befd30efa5a,0x3c47f7e05401aa4e, - 0x8714a775e3e95c78,0x65acfaec34810a71, - 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, - 
0xd31045a8341ca07c,0x1ede48111209a050, - 0x83ea2b892091e44d,0x934aed0aab460432, - 0xa4e4b66b68b65d60,0xf81da84d5617853f, - 0xce1de40642e3f4b9,0x36251260ab9d668e, - 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, - 0xa1075a24e4421730,0xb24cf65b8612f81f, - 0xc94930ae1d529cfc,0xdee033f26797b627, - 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, - 0x9d412e0806e88aa5,0x8e1f289560ee864e, - 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, - 0xf5b5d7ec8acb58a2,0xae10af696774b1db, - 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, - 0xbff610b0cc6edd3f,0x17fd090a58d32af3, - 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, - 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, - 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, - 0xea53df5fd18d5513,0x84c86189216dc5ed, - 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, - 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, - 0xe4d5e82392a40515,0xfabaf3feaa5334a, - 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, - 0xb2c71d5bca9023f8,0x743e20e9ef511012, - 0xdf78e4b2bd342cf6,0x914da9246b255416, - 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, - 0xae9672aba3d0c320,0xa184ac2473b529b1, - 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, - 0x8865899617fb1871,0x7e2fa67c7a658892, - 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, - 0xd51ea6fa85785631,0x552a74227f3ea565, - 0x8533285c936b35de,0xd53a88958f87275f, - 0xa67ff273b8460356,0x8a892abaf368f137, - 0xd01fef10a657842c,0x2d2b7569b0432d85, - 0x8213f56a67f6b29b,0x9c3b29620e29fc73, - 0xa298f2c501f45f42,0x8349f3ba91b47b8f, - 0xcb3f2f7642717713,0x241c70a936219a73, - 0xfe0efb53d30dd4d7,0xed238cd383aa0110, - 0x9ec95d1463e8a506,0xf4363804324a40aa, - 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, - 0xf81aa16fdc1b81da,0xdd94b7868e94050a, - 0x9b10a4e5e9913128,0xca7cf2b4191c8326, - 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, - 0xf24a01a73cf2dccf,0xbc633b39673c8cec, - 0x976e41088617ca01,0xd5be0503e085d813, - 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, - 0xec9c459d51852ba2,0xddf8e7d60ed1219e, - 0x93e1ab8252f33b45,0xcabb90e5c942b503, - 0xb8da1662e7b00a17,0x3d6a751f3b936243, - 0xe7109bfba19c0c9d,0xcc512670a783ad4, - 0x906a617d450187e2,0x27fb2b80668b24c5, - 0xb484f9dc9641e9da,0xb1f9f660802dedf6, - 0xe1a63853bbd26451,0x5e7873f8a0396973, - 0x8d07e33455637eb2,0xdb0b487b6423e1e8, - 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, - 0xdc5c5301c56b75f7,0x7641a140cc7810fb, - 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, - 0xac2820d9623bf429,0x546345fa9fbdcd44, - 0xd732290fbacaf133,0xa97c177947ad4095, - 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, - 0xa81f301449ee8c70,0x5c68f256bfff5a74, - 0xd226fc195c6a2f8c,0x73832eec6fff3111, - 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, - 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, - 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, - 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, - 0xa0555e361951c366,0xd7e105bcc332621f, - 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, - 0xfa856334878fc150,0xb14f98f6f0feb951, - 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, - 0xc3b8358109e84f07,0xa862f80ec4700c8, - 0xf4a642e14c6262c8,0xcd27bb612758c0fa, - 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, - 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, - 0xeeea5d5004981478,0x1858ccfce06cac74, - 0x95527a5202df0ccb,0xf37801e0c43ebc8, - 0xbaa718e68396cffd,0xd30560258f54e6ba, - 0xe950df20247c83fd,0x47c6b82ef32a2069, - 0x91d28b7416cdd27e,0x4cdc331d57fa5441, - 0xb6472e511c81471d,0xe0133fe4adf8e952, - 0xe3d8f9e563a198e5,0x58180fddd97723a6, - 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; +constexpr uint64_t powers_template::power_of_five_128[number_of_entries]; + using powers = powers_template<>; -} +} // namespace fast_float } // namespace arrow_vendored #endif diff --git a/cpp/src/arrow/vendored/fast_float/float_common.h 
b/cpp/src/arrow/vendored/fast_float/float_common.h index 0d6bfe7efb88b..717320126750c 100644 --- a/cpp/src/arrow/vendored/fast_float/float_common.h +++ b/cpp/src/arrow/vendored/fast_float/float_common.h @@ -7,6 +7,25 @@ #include #include +#ifdef __has_include +#if __has_include(<version>) +#include <version> +#endif +#endif + +#if __cpp_lib_bit_cast >= 201806L +#include <bit> +#define FASTFLOAT_HAS_BIT_CAST 1 +#else +#define FASTFLOAT_HAS_BIT_CAST 0 +#endif + +#if __cpp_lib_is_constant_evaluated >= 201811L +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 +#else +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 +#endif + #if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) \ || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \ || defined(__MINGW64__) \ @@ -14,7 +33,7 @@ || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) ) #define FASTFLOAT_64BIT 1 #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ - || defined(__arm__) || defined(_M_ARM) \ + || defined(__arm__) || defined(_M_ARM) || defined(__ppc__) \ || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) #define FASTFLOAT_32BIT 1 #else @@ -50,7 +69,11 @@ #elif defined(sun) || defined(__sun) #include <sys/byteorder.h> #else +#ifdef __has_include +#if __has_include(<endian.h>) #include <endian.h> +#endif //__has_include(<endian.h>) +#endif //__has_include #endif # #ifndef __BYTE_ORDER__ @@ -77,23 +100,46 @@ #endif #ifndef FASTFLOAT_ASSERT -#define FASTFLOAT_ASSERT(x) { if (!(x)) abort(); } +#define FASTFLOAT_ASSERT(x) { ((void)(x)); } #endif #ifndef FASTFLOAT_DEBUG_ASSERT -#include <cassert> -#define FASTFLOAT_DEBUG_ASSERT(x) assert(x) +#define FASTFLOAT_DEBUG_ASSERT(x) { ((void)(x)); } #endif // rust style `try!()` macro, or `?` operator #define FASTFLOAT_TRY(x) { if (!(x)) return false; } +// Testing for https://wg21.link/N3652, adopted in C++14 +#if __cpp_constexpr >= 201304 +#define FASTFLOAT_CONSTEXPR14 constexpr +#else +#define FASTFLOAT_CONSTEXPR14 +#endif + +// Testing for relevant C++20 constexpr library features +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \ + && FASTFLOAT_HAS_BIT_CAST \ + && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#define FASTFLOAT_CONSTEXPR20 constexpr +#else +#define FASTFLOAT_CONSTEXPR20 +#endif + namespace arrow_vendored { namespace fast_float { +fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED + return std::is_constant_evaluated(); +#else + return false; +#endif +} + // Compares two ASCII strings in a case insensitive manner.
 // Compares two ASCII strings in a case insensitive manner.
-inline bool fastfloat_strncasecmp(const char *input1, const char *input2,
-                                  size_t length) {
+inline FASTFLOAT_CONSTEXPR14 bool
+fastfloat_strncasecmp(const char *input1, const char *input2, size_t length) {
   char running_diff{0};
   for (size_t i = 0; i < length; i++) {
     running_diff |= (input1[i] ^ input2[i]);
@@ -110,14 +156,14 @@ template <typename T>
 struct span {
   const T* ptr;
   size_t length;
-  span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {}
-  span() : ptr(nullptr), length(0) {}
+  constexpr span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {}
+  constexpr span() : ptr(nullptr), length(0) {}

   constexpr size_t len() const noexcept {
     return length;
   }

-  const T& operator[](size_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const T& operator[](size_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return ptr[index];
   }
@@ -126,13 +172,31 @@ struct value128 {
   uint64_t low;
   uint64_t high;
-  value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {}
-  value128() : low(0), high(0) {}
+  constexpr value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {}
+  constexpr value128() : low(0), high(0) {}
 };

+/* Helper C++11 constexpr generic implementation of leading_zeroes */
+fastfloat_really_inline constexpr
+int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
+  return (
+    ((input_num & uint64_t(0xffffffff00000000)) && (input_num >>= 32, last_bit |= 32)),
+    ((input_num & uint64_t(        0xffff0000)) && (input_num >>= 16, last_bit |= 16)),
+    ((input_num & uint64_t(            0xff00)) && (input_num >>=  8, last_bit |=  8)),
+    ((input_num & uint64_t(              0xf0)) && (input_num >>=  4, last_bit |=  4)),
+    ((input_num & uint64_t(               0xc)) && (input_num >>=  2, last_bit |=  2)),
+    ((input_num & uint64_t(               0x2)) && (input_num >>=  1, last_bit |=  1)),
+    63 - last_bit
+  );
+}
+
 /* result might be undefined when input_num is zero */
-fastfloat_really_inline int leading_zeroes(uint64_t input_num) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+int leading_zeroes(uint64_t input_num) {
   assert(input_num > 0);
+  if (cpp20_and_in_constexpr()) {
+    return leading_zeroes_generic(input_num);
+  }
 #ifdef FASTFLOAT_VISUAL_STUDIO
 #if defined(_M_X64) || defined(_M_ARM64)
   unsigned long leading_zero = 0;
@@ -141,31 +205,20 @@ fastfloat_really_inline int leading_zeroes(uint64_t input_num) {
   _BitScanReverse64(&leading_zero, input_num);
   return (int)(63 - leading_zero);
 #else
-  int last_bit = 0;
-  if(input_num & uint64_t(0xffffffff00000000)) input_num >>= 32, last_bit |= 32;
-  if(input_num & uint64_t(        0xffff0000)) input_num >>= 16, last_bit |= 16;
-  if(input_num & uint64_t(            0xff00)) input_num >>=  8, last_bit |=  8;
-  if(input_num & uint64_t(              0xf0)) input_num >>=  4, last_bit |=  4;
-  if(input_num & uint64_t(               0xc)) input_num >>=  2, last_bit |=  2;
-  if(input_num & uint64_t(               0x2)) input_num >>=  1, last_bit |=  1;
-  return 63 - last_bit;
+  return leading_zeroes_generic(input_num);
 #endif
 #else
   return __builtin_clzll(input_num);
 #endif
 }

-#ifdef FASTFLOAT_32BIT
-
 // slow emulation routine for 32-bit
-fastfloat_really_inline uint64_t emulu(uint32_t x, uint32_t y) {
+fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }

-// slow emulation routine for 32-bit
-#if !defined(__MINGW64__)
-fastfloat_really_inline uint64_t _umul128(uint64_t ab, uint64_t cd,
-                                          uint64_t *hi) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); @@ -175,14 +228,28 @@ fastfloat_really_inline uint64_t _umul128(uint64_t ab, uint64_t cd, (adbc_carry << 32) + !!(lo < bd); return lo; } + +#ifdef FASTFLOAT_32BIT + +// slow emulation routine for 32-bit +#if !defined(__MINGW64__) +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + return umul128_generic(ab, cd, hi); +} #endif // !__MINGW64__ #endif // FASTFLOAT_32BIT // compute 64-bit a*b -fastfloat_really_inline value128 full_multiplication(uint64_t a, - uint64_t b) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +value128 full_multiplication(uint64_t a, uint64_t b) { + if (cpp20_and_in_constexpr()) { + value128 answer; + answer.low = umul128_generic(a, b, &answer.high); + return answer; + } value128 answer; #if defined(_M_ARM64) && !defined(__MINGW32__) // ARM64 has native support for 64-bit multiplications, no need to emulate @@ -196,7 +263,7 @@ fastfloat_really_inline value128 full_multiplication(uint64_t a, answer.low = uint64_t(r); answer.high = uint64_t(r >> 64); #else - #error Not implemented + answer.low = umul128_generic(a, b, &answer.high); #endif return answer; } @@ -205,10 +272,10 @@ struct adjusted_mantissa { uint64_t mantissa{0}; int32_t power2{0}; // a negative value indicates an invalid result adjusted_mantissa() = default; - bool operator==(const adjusted_mantissa &o) const { + constexpr bool operator==(const adjusted_mantissa &o) const { return mantissa == o.mantissa && power2 == o.power2; } - bool operator!=(const adjusted_mantissa &o) const { + constexpr bool operator!=(const adjusted_mantissa &o) const { return mantissa != o.mantissa || power2 != o.power2; } }; @@ -219,8 +286,8 @@ constexpr static int32_t invalid_am_bias = -0x8000; constexpr static double powers_of_ten_double[] = { 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; -constexpr static float powers_of_ten_float[] = {1e0, 1e1, 1e2, 1e3, 1e4, 1e5, - 1e6, 1e7, 1e8, 1e9, 1e10}; +constexpr static float powers_of_ten_float[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; // used for max_mantissa_double and max_mantissa_float constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; // Largest integer value v so that (5**index * v) <= 1<<53. @@ -433,23 +500,41 @@ template <> inline constexpr binary_format::equiv_uint } template -fastfloat_really_inline void to_float(bool negative, adjusted_mantissa am, T &value) { - uint64_t word = am.mantissa; - word |= uint64_t(am.power2) << binary_format::mantissa_explicit_bits(); - word = negative - ? 
word | (uint64_t(1) << binary_format::sign_index()) : word; -#if FASTFLOAT_IS_BIG_ENDIAN == 1 - if (std::is_same::value) { - ::memcpy(&value, (char *)&word + 4, sizeof(T)); // extract value at offset 4-7 if float on big-endian - } else { - ::memcpy(&value, &word, sizeof(T)); - } +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +void to_float(bool negative, adjusted_mantissa am, T &value) { + using uint = typename binary_format::equiv_uint; + uint word = (uint)am.mantissa; + word |= uint(am.power2) << binary_format::mantissa_explicit_bits(); + word |= uint(negative) << binary_format::sign_index(); +#if FASTFLOAT_HAS_BIT_CAST + value = std::bit_cast(word); #else - // For little-endian systems: - ::memcpy(&value, &word, sizeof(T)); + ::memcpy(&value, &word, sizeof(T)); #endif } +#if FASTFLOAT_SKIP_WHITE_SPACE // disabled by default +template +struct space_lut { + static constexpr bool value[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +}; + +template +constexpr bool space_lut::value[]; + +inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } +#endif } // namespace fast_float } // namespace arrow_vendored diff --git a/cpp/src/arrow/vendored/fast_float/parse_number.h b/cpp/src/arrow/vendored/fast_float/parse_number.h index e1c9603aeaa94..905d614c9db29 100644 --- a/cpp/src/arrow/vendored/fast_float/parse_number.h +++ b/cpp/src/arrow/vendored/fast_float/parse_number.h @@ -30,6 +30,11 @@ from_chars_result parse_infnan(const char *first, const char *last, T &value) n minusSign = true; ++first; } +#if FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if (*first == '+') { + ++first; + } +#endif if (last - first >= 3) { if (fastfloat_strncasecmp(first, "nan", 3)) { answer.ptr = (first += 3); @@ -67,6 +72,10 @@ from_chars_result parse_infnan(const char *first, const char *last, T &value) n * Credit : @mwalcott3 */ fastfloat_really_inline bool rounds_to_nearest() noexcept { + // https://lemire.me/blog/2020/06/26/gcc-not-nearest/ +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return false; +#endif // See // A fast function to check your floating-point rounding mode // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ @@ -100,7 +109,25 @@ fastfloat_really_inline bool rounds_to_nearest() noexcept { // // Note: This may fail to be accurate if fast-math has been // enabled, as rounding conventions may not apply. + #if FASTFLOAT_VISUAL_STUDIO + # pragma warning(push) + // todo: is there a VS warning? 
+  // see https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013
+  #elif defined(__clang__)
+  # pragma clang diagnostic push
+  # pragma clang diagnostic ignored "-Wfloat-equal"
+  #elif defined(__GNUC__)
+  # pragma GCC diagnostic push
+  # pragma GCC diagnostic ignored "-Wfloat-equal"
+  #endif
   return (fmini + 1.0f == 1.0f - fmini);
+  #if FASTFLOAT_VISUAL_STUDIO
+  # pragma warning(pop)
+  #elif defined(__clang__)
+  # pragma clang diagnostic pop
+  #elif defined(__GNUC__)
+  # pragma GCC diagnostic pop
+  #endif
 }
 } // namespace detail
@@ -119,6 +146,11 @@ from_chars_result from_chars_advanced(const char *first, const char *last,
   from_chars_result answer;
+#if FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
   if (first == last) {
     answer.ec = std::errc::invalid_argument;
     answer.ptr = first;
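[Editor's note] Two things in the parse_number.h hunks above are worth a gloss: the FLT_EVAL_METHOD early-out gives up when intermediates carry extended precision, and the push/ignore/pop pragmas silence -Wfloat-equal around a comparison that is exact by design. A self-contained sketch of that rounding-mode probe; this is a demo function under the same assumptions, not a drop-in for the vendored rounds_to_nearest():

```cpp
#include <cfloat>

bool demo_rounds_to_nearest() noexcept {
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
  return false;  // extended intermediate precision: the probe below is unreliable
#endif
  static volatile float fmin = FLT_MIN;  // volatile defeats constant folding
  float fmini = fmin;
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wfloat-equal"
#elif defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Under round-to-nearest both sides round to exactly 1.0f; under directed
  // rounding one side lands on a neighboring representable value instead.
  bool nearest = (fmini + 1.0f == 1.0f - fmini);
#if defined(__clang__)
#pragma clang diagnostic pop
#elif defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
  return nearest;
}

int main() { return demo_rounds_to_nearest() ? 0 : 1; }
```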
diff --git a/cpp/src/arrow/vendored/fast_float/update.sh b/cpp/src/arrow/vendored/fast_float/update.sh
index ab6e9515da5d8..f0e2d3dc508c5 100755
--- a/cpp/src/arrow/vendored/fast_float/update.sh
+++ b/cpp/src/arrow/vendored/fast_float/update.sh
@@ -23,7 +23,7 @@ source_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 if [ "$#" -ne 1 ]; then
   echo "Usage: $0 VERSION"
-  echo " e.g.: $0 3.8.1"
+  echo " e.g.: $0 3.10.1"
   exit 1
 fi
diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt
index d7c7ef157b442..4ca5cc655b2a7 100644
--- a/cpp/src/gandiva/precompiled/CMakeLists.txt
+++ b/cpp/src/gandiva/precompiled/CMakeLists.txt
@@ -83,7 +83,9 @@ foreach(SRC_FILE ${PRECOMPILED_SRCS})
                          -I${ARROW_BINARY_DIR}/src)
   if(NOT ARROW_USE_NATIVE_INT128)
-    list(APPEND PRECOMPILE_COMMAND -I${Boost_INCLUDE_DIR})
+    foreach(boost_include_dir ${Boost_INCLUDE_DIRS})
+      list(APPEND PRECOMPILE_COMMAND -I${boost_include_dir})
+    endforeach()
   endif()
   add_custom_command(OUTPUT ${BC_FILE}
                      COMMAND ${PRECOMPILE_COMMAND}
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index e6aad7cee2a3e..eb2e2d8fed88f 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -401,11 +401,14 @@ endif()
 add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc test_util.cc)
 add_parquet_test(schema_test)
-add_parquet_benchmark(bloom_filter_benchmark)
+add_parquet_benchmark(bloom_filter_benchmark SOURCES bloom_filter_benchmark.cc
+                      benchmark_util.cc)
 add_parquet_benchmark(column_reader_benchmark)
 add_parquet_benchmark(column_io_benchmark)
 add_parquet_benchmark(encoding_benchmark)
 add_parquet_benchmark(level_conversion_benchmark)
+add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
+                      benchmark_util.cc)
 add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
 if(ARROW_WITH_BROTLI)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 08280163f1903..3e2dc3e213bfc 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2428,9 +2428,9 @@ TEST(TestArrowReadWrite, CoalescedReadsAndNonCoalescedReads) {
   ASSERT_EQ(2, reader->num_row_groups());
-  // Pre-buffer 3 columns in the 2nd row group.
+  // Pre-buffer column 0 and column 3 in the 2nd row group.
   const std::vector<int> row_groups = {1};
-  const std::vector<int> column_indices = {0, 1, 4};
+  const std::vector<int> column_indices = {0, 3};
   reader->parquet_reader()->PreBuffer(row_groups, column_indices,
                                       ::arrow::io::IOContext(),
                                       ::arrow::io::CacheOptions::Defaults());
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index fc593718ab65e..a1cc989ba8ea0 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -50,9 +50,7 @@ using parquet::schema::PrimitiveNode;
 using ::testing::ElementsAre;
-namespace parquet {
-
-namespace arrow {
+namespace parquet::arrow {
 const auto BOOL = ::arrow::boolean();
 const auto UINT8 = ::arrow::uint8();
@@ -1762,5 +1760,4 @@ TEST_F(TestLevels, ListErrors) {
   }
 }
-} // namespace arrow
-} // namespace parquet
+} // namespace parquet::arrow
diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc
index 604f163a66f84..ad4496933ef4c 100644
--- a/cpp/src/parquet/arrow/arrow_statistics_test.cc
+++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc
@@ -36,8 +36,7 @@ using arrow::Table;
 using arrow::io::BufferReader;
-namespace parquet {
-namespace arrow {
+namespace parquet::arrow {
 struct StatisticsTestParam {
   std::shared_ptr<::arrow::Table> table;
@@ -157,5 +156,4 @@ INSTANTIATE_TEST_SUITE_P(
         /*expected_min=*/"z",
         /*expected_max=*/"z"}));
-} // namespace arrow
-} // namespace parquet
+} // namespace parquet::arrow
diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc
index 2aeee6e500f5d..919c97f4323b6 100644
--- a/cpp/src/parquet/arrow/path_internal.cc
+++ b/cpp/src/parquet/arrow/path_internal.cc
@@ -108,8 +108,7 @@
 #include "parquet/properties.h"
-namespace parquet {
-namespace arrow {
+namespace parquet::arrow {
 namespace {
@@ -901,5 +900,4 @@ Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullabl
   return Status::OK();
 }
-} // namespace arrow
-} // namespace parquet
+} // namespace parquet::arrow
diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc
index 4645807007478..fb9c404247f3b 100644
--- a/cpp/src/parquet/arrow/path_internal_test.cc
+++ b/cpp/src/parquet/arrow/path_internal_test.cc
@@ -29,8 +29,7 @@
 #include "parquet/properties.h"
-namespace parquet {
-namespace arrow {
+namespace parquet::arrow {
 using ::arrow::default_memory_pool;
 using ::arrow::field;
@@ -644,5 +643,4 @@ TEST_F(MultipathLevelBuilderTest, TestPrimitiveNonNullable) {
   EXPECT_THAT(results_[0].post_list_elements[0].end, Eq(4));
 }
-} // namespace arrow
-} // namespace parquet
+} // namespace parquet::arrow
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index 40fbdcbb562b1..142119b770b8c 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -76,8 +76,7 @@ using parquet::internal::RecordReader;
 namespace bit_util = arrow::bit_util;
-namespace parquet {
-namespace arrow {
+namespace parquet::arrow {
 namespace {
 ::arrow::Result<std::shared_ptr<ArrayData>> ChunksToSingle(const ChunkedArray& chunked) {
@@ -260,16 +259,6 @@ class FileReaderImpl : public FileReader {
                       reader_->metadata()->key_value_metadata(), out);
   }
-  Status ReadSchemaField(int i, std::shared_ptr<ChunkedArray>* out) override {
-    auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns()));
-    std::vector<int> row_groups = Iota(reader_->metadata()->num_row_groups());
-
-    std::unique_ptr<ColumnReaderImpl> reader;
-    RETURN_NOT_OK(GetFieldReader(i, included_leaves,
row_groups, &reader)); - - return ReadColumn(i, row_groups, reader.get(), out); - } - Status ReadColumn(int i, const std::vector& row_groups, ColumnReader* reader, std::shared_ptr* out) { BEGIN_PARQUET_CATCH_EXCEPTIONS @@ -1414,5 +1403,4 @@ Status FuzzReader(const uint8_t* data, int64_t size) { } // namespace internal -} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cbd36176f5e3..6e46ca43f7b18 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -141,26 +141,19 @@ class PARQUET_EXPORT FileReader { /// \brief Read column as a whole into a chunked array. /// - /// The indicated column index is relative to the schema + /// The index i refers the index of the top level schema field, which may + /// be nested or flat - e.g. + /// + /// 0 foo.bar + /// foo.bar.baz + /// foo.qux + /// 1 foo2 + /// 2 foo3 + /// + /// i=0 will read the entire foo struct, i=1 the foo2 primitive column etc virtual ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0; - // NOTE: Experimental API - // Reads a specific top level schema field into an Array - // The index i refers the index of the top level schema field, which may - // be nested or flat - e.g. - // - // 0 foo.bar - // foo.bar.baz - // foo.qux - // 1 foo2 - // 2 foo3 - // - // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ARROW_DEPRECATED("Deprecated in 9.0.0. Use ReadColumn instead.") - virtual ::arrow::Status ReadSchemaField( - int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0; - /// \brief Return a RecordBatchReader of all row groups and columns. virtual ::arrow::Status GetRecordBatchReader( std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index ca61682357b9c..bbc5c35713225 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -96,8 +96,7 @@ using ParquetType = parquet::Type; namespace bit_util = arrow::bit_util; -namespace parquet { -namespace arrow { +namespace parquet::arrow { namespace { template @@ -936,5 +935,4 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr& va return Status::OK(); } -} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reconstruct_internal_test.cc b/cpp/src/parquet/arrow/reconstruct_internal_test.cc index 8a69f8266f1e6..4e1f421498e85 100644 --- a/cpp/src/parquet/arrow/reconstruct_internal_test.cc +++ b/cpp/src/parquet/arrow/reconstruct_internal_test.cc @@ -65,8 +65,7 @@ using testing::Eq; using testing::NotNull; using testing::SizeIs; -namespace parquet { -namespace arrow { +namespace parquet::arrow { using parquet::schema::GroupNode; using parquet::schema::NodePtr; @@ -1637,5 +1636,4 @@ TEST_F(TestReconstructColumn, ListList6) { // TODO legacy-list-in-struct etc.? 
-} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 276b5f9c55751..b840f65b052f2 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -63,9 +63,7 @@ using parquet::LogicalType; using parquet::internal::LevelInfo; -namespace parquet { - -namespace arrow { +namespace parquet::arrow { // ---------------------------------------------------------------------- // Parquet to Arrow schema conversion @@ -1111,5 +1109,4 @@ Status SchemaManifest::Make(const SchemaDescriptor* schema, return Status::OK(); } -} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 4896e656da350..bb75cce084097 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -23,9 +23,7 @@ using ArrowType = ::arrow::DataType; using ArrowTypeId = ::arrow::Type; using ParquetType = parquet::Type; -namespace parquet { - -namespace arrow { +namespace parquet::arrow { using ::arrow::Result; using ::arrow::Status; @@ -220,5 +218,4 @@ Result> GetArrowType( primitive.type_length(), int96_arrow_time_unit); } -} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index fb837c3ee6cab..55292ac35ab9c 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -24,8 +24,7 @@ namespace arrow { class DataType; } -namespace parquet { -namespace arrow { +namespace parquet::arrow { using ::arrow::Result; @@ -47,5 +46,4 @@ Result> GetArrowType( const schema::PrimitiveNode& primitive, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); -} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 6d22f318f6b97..0c67e8d6bb3d4 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -73,8 +73,7 @@ using parquet::ParquetFileWriter; using parquet::ParquetVersion; using parquet::schema::GroupNode; -namespace parquet { -namespace arrow { +namespace parquet::arrow { namespace { @@ -600,5 +599,4 @@ Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool, return writer->Close(); } -} // namespace arrow -} // namespace parquet +} // namespace parquet::arrow diff --git a/cpp/src/parquet/benchmark_util.cc b/cpp/src/parquet/benchmark_util.cc new file mode 100644 index 0000000000000..6220336e1c39e --- /dev/null +++ b/cpp/src/parquet/benchmark_util.cc @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/benchmark_util.h"
+
+#include <random>
+
+namespace parquet::benchmark {
+
+namespace {
+
+void GenerateRandomString(uint32_t length, uint32_t seed, std::vector<uint8_t>* heap) {
+  // Character set used to generate random string
+  const std::string charset =
+      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<uint32_t> dist(0, static_cast<uint32_t>(charset.size() - 1));
+
+  for (uint32_t i = 0; i < length; i++) {
+    heap->emplace_back(charset[dist(gen)]);
+  }
+}
+
+template <typename T>
+void GenerateBenchmarkDataIntegerImpl(uint32_t size, uint32_t seed, T* data,
+                                      std::vector<uint8_t>* heap, uint32_t) {
+  static_assert(std::is_integral_v<T>);
+  heap->clear();
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<T> d(std::numeric_limits<T>::min(),
+                                     std::numeric_limits<T>::max());
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i] = d(gen);
+  }
+}
+
+template <typename T>
+void GenerateBenchmarkDataFloatImpl(uint32_t size, uint32_t seed, T* data,
+                                    std::vector<uint8_t>* heap, uint32_t) {
+  static_assert(std::is_floating_point_v<T>);
+  heap->clear();
+  std::default_random_engine gen(seed);
+  std::uniform_real_distribution<T> d(std::numeric_limits<T>::lowest(),
+                                      std::numeric_limits<T>::max());
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i] = d(gen);
+  }
+}
+
+} // namespace
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, int32_t* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length) {
+  GenerateBenchmarkDataIntegerImpl(size, seed, data, heap, data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, int64_t* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length) {
+  GenerateBenchmarkDataIntegerImpl(size, seed, data, heap, data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, float* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length) {
+  GenerateBenchmarkDataFloatImpl(size, seed, data, heap, data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, double* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length) {
+  GenerateBenchmarkDataFloatImpl(size, seed, data, heap, data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, Int96* data,
+                           std::vector<uint8_t>* heap, uint32_t) {
+  heap->clear();
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<uint32_t> d(std::numeric_limits<uint32_t>::min(),
+                                            std::numeric_limits<uint32_t>::max());
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i].value[0] = d(gen);
+    data[i].value[1] = d(gen);
+    data[i].value[2] = d(gen);
+  }
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, FLBA* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length) {
+  heap->clear();
+  GenerateRandomString(data_string_length * size, seed, heap);
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i].ptr = heap->data() + i * data_string_length;
+  }
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, ByteArray* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length) {
+  heap->clear();
+  GenerateRandomString(data_string_length * size, seed, heap);
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i].ptr = heap->data() + i * data_string_length;
+    data[i].len = data_string_length;
+  }
+}
+
+} // namespace parquet::benchmark
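[Editor's note] With the generators factored out of the bloom filter benchmark, both bloom_filter_benchmark.cc and the new page_index_benchmark.cc consume them through the specializations declared in benchmark_util.h below. A minimal usage sketch, where the constant values are arbitrary picks for the demo:

```cpp
#include <cstdint>
#include <vector>

#include "parquet/benchmark_util.h"
#include "parquet/types.h"

int main() {
  constexpr uint32_t kNumValues = 1024;
  constexpr uint32_t kStringLength = 8;
  std::vector<parquet::ByteArray> values(kNumValues);
  std::vector<uint8_t> heap;  // owns the bytes; each ByteArray::ptr points into it
  parquet::benchmark::GenerateBenchmarkData(kNumValues, /*seed=*/42, values.data(),
                                            &heap, kStringLength);
  return values[0].len == kStringLength ? 0 : 1;
}
```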
diff --git a/cpp/src/parquet/benchmark_util.h b/cpp/src/parquet/benchmark_util.h
new file mode 100644
index 0000000000000..7996f7f85e898
--- /dev/null
+++ b/cpp/src/parquet/benchmark_util.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#include "parquet/types.h"
+
+namespace parquet::benchmark {
+
+template <typename T>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length);
+
+#define _GENERATE_BENCHMARK_DATA_DECL(KLASS)                              \
+  template <>                                                             \
+  void GenerateBenchmarkData(uint32_t size, uint32_t seed, KLASS* data,   \
+                             std::vector<uint8_t>* heap, uint32_t data_string_length);
+
+_GENERATE_BENCHMARK_DATA_DECL(int32_t)
+_GENERATE_BENCHMARK_DATA_DECL(int64_t)
+_GENERATE_BENCHMARK_DATA_DECL(float)
+_GENERATE_BENCHMARK_DATA_DECL(double)
+_GENERATE_BENCHMARK_DATA_DECL(ByteArray)
+_GENERATE_BENCHMARK_DATA_DECL(FLBA)
+_GENERATE_BENCHMARK_DATA_DECL(Int96)
+
+#undef _GENERATE_BENCHMARK_DATA_DECL
+
+} // namespace parquet::benchmark
diff --git a/cpp/src/parquet/bloom_filter_benchmark.cc b/cpp/src/parquet/bloom_filter_benchmark.cc
index fa934b1d5290a..13c731d975b2c 100644
--- a/cpp/src/parquet/bloom_filter_benchmark.cc
+++ b/cpp/src/parquet/bloom_filter_benchmark.cc
@@ -18,13 +18,13 @@
 #include "benchmark/benchmark.h"
 #include "arrow/util/logging.h"
+#include "parquet/benchmark_util.h"
 #include "parquet/bloom_filter.h"
 #include "parquet/properties.h"
 #include <random>
-namespace parquet {
-namespace benchmark {
+namespace parquet::benchmark {
 constexpr static uint32_t kNumBloomFilterInserts = 16 * 1024;
 // The sample string length for FLBA and ByteArray benchmarks
@@ -40,63 +40,11 @@ std::unique_ptr<BloomFilter> CreateBloomFilter(uint32_t num_values) {
   return bloom_filter;
 }
-void GenerateRandomString(uint32_t length, uint32_t seed, std::vector<uint8_t>* heap) {
-  // Character set used to generate random string
-  const std::string charset =
-      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-  std::default_random_engine gen(seed);
-  std::uniform_int_distribution<uint32_t> dist(0, static_cast<uint32_t>(charset.size() - 1));
-
-  for (uint32_t i = 0; i < length; i++) {
-    heap->push_back(charset[dist(gen)]);
-  }
-}
-
-template <typename T>
-void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
-                           [[maybe_unused]] std::vector<uint8_t>* heap = nullptr) {
-  if constexpr (std::is_integral_v<T>) {
-    std::default_random_engine gen(seed);
-    std::uniform_int_distribution<T> d(std::numeric_limits<T>::min(),
-                                       std::numeric_limits<T>::max());
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i] = d(gen);
-    }
-  } else if constexpr (std::is_floating_point_v<T>) {
-    std::default_random_engine gen(seed);
-    std::uniform_real_distribution<T> d(std::numeric_limits<T>::lowest(),
-                                        std::numeric_limits<T>::max());
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i] = d(gen);
-    }
-  } else if constexpr (std::is_same_v<T, FLBA>) { -
GenerateRandomString(kDataStringLength * size, seed, heap); - for (uint32_t i = 0; i < size; ++i) { - data[i].ptr = heap->data() + i * kDataStringLength; - } - } else if constexpr (std::is_same_v) { - GenerateRandomString(kDataStringLength * size, seed, heap); - for (uint32_t i = 0; i < size; ++i) { - data[i].ptr = heap->data() + i * kDataStringLength; - data[i].len = kDataStringLength; - } - } else if constexpr (std::is_same_v) { - std::default_random_engine gen(seed); - std::uniform_int_distribution d(std::numeric_limits::min(), - std::numeric_limits::max()); - for (uint32_t i = 0; i < size; ++i) { - data[i].value[0] = d(gen); - data[i].value[1] = d(gen); - data[i].value[2] = d(gen); - } - } -} - std::vector GetHashValues(uint32_t num_values, uint32_t seed) { // Generate sample data values std::vector values(num_values); - GenerateBenchmarkData(num_values, seed, values.data()); + std::vector heap; + GenerateBenchmarkData(num_values, seed, values.data(), &heap, kDataStringLength); // Create a temp filter to compute hash values auto filter = CreateBloomFilter(/*num_values=*/8); std::vector hashes(num_values); @@ -109,7 +57,8 @@ static void BM_ComputeHash(::benchmark::State& state) { using T = typename DType::c_type; std::vector values(kNumBloomFilterInserts); std::vector heap; - GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap); + GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap, + kDataStringLength); auto filter = CreateBloomFilter(kNumBloomFilterInserts); for (auto _ : state) { uint64_t total = 0; @@ -136,7 +85,8 @@ static void BM_BatchComputeHash(::benchmark::State& state) { using T = typename DType::c_type; std::vector values(kNumBloomFilterInserts); std::vector heap; - GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap); + GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap, + kDataStringLength); auto filter = CreateBloomFilter(kNumBloomFilterInserts); std::vector hashes(kNumBloomFilterInserts); for (auto _ : state) { @@ -231,5 +181,4 @@ BENCHMARK(BM_BatchInsertHash); BENCHMARK(BM_FindExistingHash); BENCHMARK(BM_FindNonExistingHash); -} // namespace benchmark -} // namespace parquet +} // namespace parquet::benchmark diff --git a/cpp/src/parquet/bloom_filter_reader_test.cc b/cpp/src/parquet/bloom_filter_reader_test.cc index 64dd0d9b9d190..e297ab7045120 100644 --- a/cpp/src/parquet/bloom_filter_reader_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_test.cc @@ -22,8 +22,7 @@ #include "parquet/file_reader.h" #include "parquet/test_util.h" -namespace parquet { -namespace test { +namespace parquet::test { TEST(BloomFilterReader, ReadBloomFilter) { std::string dir_string(parquet::test::get_data_dir()); @@ -70,5 +69,4 @@ TEST(BloomFilterReader, FileNotHaveBloomFilter) { ASSERT_EQ(nullptr, bloom_filter); } -} // namespace test -} // namespace parquet +} // namespace parquet::test diff --git a/cpp/src/parquet/column_io_benchmark.cc b/cpp/src/parquet/column_io_benchmark.cc index 6ee579bec9a69..48e434a342e72 100644 --- a/cpp/src/parquet/column_io_benchmark.cc +++ b/cpp/src/parquet/column_io_benchmark.cc @@ -40,8 +40,7 @@ std::shared_ptr BuildWriter(int64_t output_size, ColumnDescriptor* schema, const WriterProperties* properties, Compression::type codec) { - std::unique_ptr pager = - PageWriter::Open(dst, codec, Codec::UseDefaultCompressionLevel(), metadata); + std::unique_ptr pager = PageWriter::Open(dst, codec, metadata); std::shared_ptr writer = ColumnWriter::Make(metadata, 
std::move(pager), properties); return std::static_pointer_cast(writer); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3294aaaf283f1..6fe1ce9da60fe 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -43,6 +43,7 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/rle_encoding.h" +#include "arrow/util/unreachable.h" #include "parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/encryption/encryption_internal.h" @@ -103,7 +104,7 @@ inline void CheckNumberDecoded(int64_t number_decoded, int64_t expected) { LevelDecoder::LevelDecoder() : num_values_remaining_(0) {} -LevelDecoder::~LevelDecoder() {} +LevelDecoder::~LevelDecoder() = default; int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, const uint8_t* data, @@ -435,9 +436,7 @@ std::shared_ptr SerializedPageReader::NextPage() { // until a maximum allowed header limit while (true) { PARQUET_ASSIGN_OR_THROW(auto view, stream_->Peek(allowed_page_size)); - if (view.size() == 0) { - return std::shared_ptr(nullptr); - } + if (view.size() == 0) return nullptr; // This gets used, then set by DeserializeThriftMsg header_size = static_cast(view.size()); @@ -1267,11 +1266,10 @@ int64_t TypedColumnReaderImpl::Skip(int64_t num_values_to_skip) { ARROW_DCHECK_NE(this->scratch_for_skip_, nullptr); do { int64_t batch_size = std::min(kSkipScratchBatchSize, values_to_skip); - values_read = ReadBatch( - static_cast(batch_size), - reinterpret_cast(this->scratch_for_skip_->mutable_data()), - reinterpret_cast(this->scratch_for_skip_->mutable_data()), - reinterpret_cast(this->scratch_for_skip_->mutable_data()), &values_read); + values_read = ReadBatch(static_cast(batch_size), + scratch_for_skip_->mutable_data_as(), + scratch_for_skip_->mutable_data_as(), + scratch_for_skip_->mutable_data_as(), &values_read); values_to_skip -= values_read; } while (values_read > 0 && values_to_skip > 0); } @@ -1315,8 +1313,7 @@ std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, default: ParquetException::NYI("type reader not implemented"); } - // Unreachable code, but suppress compiler warning - return std::shared_ptr(nullptr); + ::arrow::Unreachable(); } // ---------------------------------------------------------------------- @@ -1454,7 +1451,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, int64_t levels_remaining = levels_written_ - gap; auto left_shift = [&](::arrow::ResizableBuffer* buffer) { - int16_t* data = reinterpret_cast(buffer->mutable_data()); + auto* data = buffer->mutable_data_as(); std::copy(data + levels_position_, data + levels_written_, data + start_levels_position); PARQUET_THROW_NOT_OK(buffer->Resize(levels_remaining * sizeof(int16_t), @@ -1619,7 +1616,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, do { int64_t batch_size = std::min(kSkipScratchBatchSize, values_left); values_read = this->ReadValues( - batch_size, reinterpret_cast(this->scratch_for_skip_->mutable_data())); + batch_size, this->scratch_for_skip_->template mutable_data_as()); values_left -= values_read; } while (values_read > 0 && values_left > 0); if (values_left > 0) { @@ -2033,7 +2030,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, protected: template T* ValuesHead() { - return reinterpret_cast(values_->mutable_data()) + values_written_; + return values_->mutable_data_as() + values_written_; } LevelInfo leaf_info_; }; diff --git 
a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 47c064795dd2f..61e3d14cf04e3 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -251,14 +251,14 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(std::shared_ptr sink, Compression::type codec, - int compression_level, ColumnChunkMetaDataBuilder* metadata, - int16_t row_group_ordinal, int16_t column_chunk_ordinal, - bool use_page_checksum_verification, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t column_chunk_ordinal, bool use_page_checksum_verification, MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = nullptr, std::shared_ptr data_encryptor = nullptr, ColumnIndexBuilder* column_index_builder = nullptr, - OffsetIndexBuilder* offset_index_builder = nullptr) + OffsetIndexBuilder* offset_index_builder = nullptr, + const CodecOptions& codec_options = CodecOptions{}) : sink_(std::move(sink)), metadata_(metadata), pool_(pool), @@ -279,7 +279,7 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) { InitEncryption(); } - compressor_ = GetCodec(codec, compression_level); + compressor_ = GetCodec(codec, codec_options); thrift_serializer_ = std::make_unique(); } @@ -620,21 +620,21 @@ class SerializedPageWriter : public PageWriter { class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(std::shared_ptr sink, Compression::type codec, - int compression_level, ColumnChunkMetaDataBuilder* metadata, - int16_t row_group_ordinal, int16_t current_column_ordinal, - bool use_page_checksum_verification, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t current_column_ordinal, bool use_page_checksum_verification, MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = nullptr, std::shared_ptr data_encryptor = nullptr, ColumnIndexBuilder* column_index_builder = nullptr, - OffsetIndexBuilder* offset_index_builder = nullptr) + OffsetIndexBuilder* offset_index_builder = nullptr, + const CodecOptions& codec_options = CodecOptions{}) : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::make_unique( - in_memory_sink_, codec, compression_level, metadata, row_group_ordinal, - current_column_ordinal, use_page_checksum_verification, pool, - std::move(meta_encryptor), std::move(data_encryptor), column_index_builder, - offset_index_builder); + in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, + use_page_checksum_verification, pool, std::move(meta_encryptor), + std::move(data_encryptor), column_index_builder, offset_index_builder, + codec_options); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -692,26 +692,38 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( std::shared_ptr sink, Compression::type codec, - int compression_level, ColumnChunkMetaDataBuilder* metadata, - int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool, - bool buffered_row_group, std::shared_ptr meta_encryptor, - std::shared_ptr data_encryptor, bool page_write_checksum_enabled, - ColumnIndexBuilder* column_index_builder, OffsetIndexBuilder* offset_index_builder) { + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t column_chunk_ordinal, 
MemoryPool* pool, bool buffered_row_group, + std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor, + bool page_write_checksum_enabled, ColumnIndexBuilder* column_index_builder, + OffsetIndexBuilder* offset_index_builder, const CodecOptions& codec_options) { if (buffered_row_group) { return std::unique_ptr(new BufferedPageWriter( - std::move(sink), codec, compression_level, metadata, row_group_ordinal, - column_chunk_ordinal, page_write_checksum_enabled, pool, - std::move(meta_encryptor), std::move(data_encryptor), column_index_builder, - offset_index_builder)); + std::move(sink), codec, metadata, row_group_ordinal, column_chunk_ordinal, + page_write_checksum_enabled, pool, std::move(meta_encryptor), + std::move(data_encryptor), column_index_builder, offset_index_builder, + codec_options)); } else { return std::unique_ptr(new SerializedPageWriter( - std::move(sink), codec, compression_level, metadata, row_group_ordinal, - column_chunk_ordinal, page_write_checksum_enabled, pool, - std::move(meta_encryptor), std::move(data_encryptor), column_index_builder, - offset_index_builder)); + std::move(sink), codec, metadata, row_group_ordinal, column_chunk_ordinal, + page_write_checksum_enabled, pool, std::move(meta_encryptor), + std::move(data_encryptor), column_index_builder, offset_index_builder, + codec_options)); } } +std::unique_ptr PageWriter::Open( + std::shared_ptr sink, Compression::type codec, + int compression_level, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool, + bool buffered_row_group, std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor, bool page_write_checksum_enabled, + ColumnIndexBuilder* column_index_builder, OffsetIndexBuilder* offset_index_builder) { + return PageWriter::Open(sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, + pool, buffered_row_group, meta_encryptor, data_encryptor, + page_write_checksum_enabled, column_index_builder, + offset_index_builder, CodecOptions{compression_level}); +} // ---------------------------------------------------------------------- // ColumnWriter diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 792b108ac8835..88a42acc2f706 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -21,6 +21,7 @@ #include #include +#include "arrow/util/compression.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" @@ -35,6 +36,7 @@ class BitWriter; namespace util { class RleEncoder; +class CodecOptions; } // namespace util } // namespace arrow @@ -85,6 +87,22 @@ class PARQUET_EXPORT PageWriter { public: virtual ~PageWriter() {} + static std::unique_ptr Open( + std::shared_ptr sink, Compression::type codec, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, + int16_t column_chunk_ordinal = -1, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool buffered_row_group = false, + std::shared_ptr header_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR, + bool page_write_checksum_enabled = false, + // column_index_builder MUST outlive the PageWriter + ColumnIndexBuilder* column_index_builder = NULLPTR, + // offset_index_builder MUST outlive the PageWriter + OffsetIndexBuilder* offset_index_builder = NULLPTR, + const CodecOptions& codec_options = CodecOptions{}); + + ARROW_DEPRECATED("Deprecated in 13.0.0. 
Use CodecOptions-taking overload instead.") static std::unique_ptr Open( std::shared_ptr sink, Compression::type codec, int compression_level, ColumnChunkMetaDataBuilder* metadata, diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index af9876370ee42..58199c402bd7a 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -118,8 +118,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); std::unique_ptr pager = PageWriter::Open( - sink_, column_properties.compression(), Codec::UseDefaultCompressionLevel(), - metadata_.get(), /* row_group_ordinal */ -1, /* column_chunk_ordinal*/ -1, + sink_, column_properties.compression(), metadata_.get(), + /* row_group_ordinal */ -1, /* column_chunk_ordinal*/ -1, ::arrow::default_memory_pool(), /* buffered_row_group */ false, /* header_encryptor */ NULLPTR, /* data_encryptor */ NULLPTR, enable_checksum); std::shared_ptr writer = @@ -162,6 +162,20 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, enable_checksum)); } + void TestRequiredWithCodecOptions(Encoding::type encoding, + Compression::type compression, bool enable_dictionary, + bool enable_statistics, int64_t num_rows = SMALL_SIZE, + const std::shared_ptr& codec_options = + std::make_shared(), + bool enable_checksum = false) { + this->GenerateData(num_rows); + + this->WriteRequiredWithCodecOptions(encoding, compression, enable_dictionary, + enable_statistics, codec_options, num_rows, + enable_checksum); + ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, enable_checksum)); + } + void TestDictionaryFallbackEncoding(ParquetVersion::type version) { this->GenerateData(VERY_LARGE_SIZE); ColumnProperties column_properties; @@ -238,7 +252,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { bool enable_checksum) { ColumnProperties column_properties(encoding, compression, enable_dictionary, enable_statistics); - column_properties.set_compression_level(compression_level); + column_properties.set_codec_options( + std::make_shared(compression_level)); std::shared_ptr> writer = this->BuildWriter( num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); @@ -256,7 +271,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { bit_util::BytesForBits(static_cast(this->values_.size())) + 1, 255); ColumnProperties column_properties(encoding, compression, enable_dictionary, enable_statistics); - column_properties.set_compression_level(compression_level); + column_properties.set_codec_options( + std::make_shared(compression_level)); std::shared_ptr> writer = this->BuildWriter( num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); writer->WriteBatchSpaced(this->values_.size(), nullptr, nullptr, valid_bits.data(), 0, @@ -266,6 +282,22 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { writer->Close(); } + void WriteRequiredWithCodecOptions(Encoding::type encoding, + Compression::type compression, + bool enable_dictionary, bool enable_statistics, + const std::shared_ptr& codec_options, + int64_t num_rows, bool enable_checksum) { + ColumnProperties column_properties(encoding, compression, enable_dictionary, + enable_statistics); + column_properties.set_codec_options(codec_options); + std::shared_ptr> writer = this->BuildWriter( 
+ num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); + writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); + // The behaviour should be independent from the number of Close() calls + writer->Close(); + writer->Close(); + } + void ReadAndCompare(Compression::type compression, int64_t num_rows, bool page_checksum_verify) { this->SetupValuesOut(num_rows); @@ -522,6 +554,14 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::GZIP, false, true, LARGE_SIZE); } + +TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCodecOptions) { + auto codec_options = std::make_shared<::arrow::util::GZipCodecOptions>(); + codec_options->gzip_format = ::arrow::util::GZipFormat::GZIP; + codec_options->window_bits = 12; + this->TestRequiredWithCodecOptions(Encoding::PLAIN, Compression::GZIP, false, false, + LARGE_SIZE, codec_options); +} #endif #ifdef ARROW_WITH_LZ4 @@ -818,8 +858,7 @@ TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) { auto metadata = ColumnChunkMetaDataBuilder::Make(props, schema.Column(0)); std::unique_ptr pager = - PageWriter::Open(sink, Compression::UNCOMPRESSED, - Codec::UseDefaultCompressionLevel(), metadata.get()); + PageWriter::Open(sink, Compression::UNCOMPRESSED, metadata.get()); std::shared_ptr writer = ColumnWriter::Make(metadata.get(), std::move(pager), props.get()); auto typed_writer = std::static_pointer_cast>(writer); @@ -1350,7 +1389,7 @@ class ColumnWriterTestSizeEstimated : public ::testing::Test { schema_descriptor_->Column(0)); std::unique_ptr pager = PageWriter::Open( - sink_, compression, Codec::UseDefaultCompressionLevel(), metadata_.get(), + sink_, compression, metadata_.get(), /* row_group_ordinal */ -1, /* column_chunk_ordinal*/ -1, ::arrow::default_memory_pool(), /* buffered_row_group */ buffered, /* header_encryptor */ NULLPTR, /* data_encryptor */ NULLPTR, diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 6285c4c12539d..7a910e4220831 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -48,9 +48,7 @@ using arrow::internal::checked_cast; namespace bit_util = arrow::bit_util; -namespace parquet { - -namespace test { +namespace parquet::test { TEST(VectorBooleanTest, TestEncodeBoolDecode) { // PARQUET-454 @@ -1910,5 +1908,4 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } -} // namespace test -} // namespace parquet +} // namespace parquet::test diff --git a/cpp/src/parquet/encryption/crypto_factory.cc b/cpp/src/parquet/encryption/crypto_factory.cc index 67e3d8c5f297e..ebb7c3c7b37fb 100644 --- a/cpp/src/parquet/encryption/crypto_factory.cc +++ b/cpp/src/parquet/encryption/crypto_factory.cc @@ -26,8 +26,7 @@ #include "parquet/encryption/file_system_key_material_store.h" #include "parquet/encryption/key_toolkit_internal.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { void CryptoFactory::RegisterKmsClientFactory( std::shared_ptr kms_client_factory) { @@ -192,5 +191,4 @@ void CryptoFactory::RotateMasterKeys( double_wrapping, cache_lifetime_seconds); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/crypto_factory.h b/cpp/src/parquet/encryption/crypto_factory.h index 14015a95c85e5..291cccf30f8e3 100644 --- a/cpp/src/parquet/encryption/crypto_factory.h +++ 
b/cpp/src/parquet/encryption/crypto_factory.h @@ -25,8 +25,7 @@ #include "parquet/encryption/kms_client_factory.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = ParquetCipher::AES_GCM_V1; @@ -152,5 +151,4 @@ class PARQUET_EXPORT CryptoFactory { KeyToolkit key_toolkit_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/encryption_internal.cc b/cpp/src/parquet/encryption/encryption_internal.cc index 1c4d3d8dc4897..6e66efeff6326 100644 --- a/cpp/src/parquet/encryption/encryption_internal.cc +++ b/cpp/src/parquet/encryption/encryption_internal.cc @@ -31,8 +31,7 @@ using parquet::ParquetException; -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr int kGcmMode = 0; constexpr int kCtrMode = 1; @@ -649,5 +648,4 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD) { void RandBytes(unsigned char* buf, int num) { RAND_bytes(buf, num); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/encryption_internal.h b/cpp/src/parquet/encryption/encryption_internal.h index 24093c68be531..4ed5b5cf61243 100644 --- a/cpp/src/parquet/encryption/encryption_internal.h +++ b/cpp/src/parquet/encryption/encryption_internal.h @@ -26,8 +26,7 @@ using parquet::ParquetCipher; -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr int kGcmTagLength = 16; constexpr int kNonceLength = 12; @@ -129,5 +128,4 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD); // Wraps OpenSSL RAND_bytes function void RandBytes(unsigned char* buf, int num); -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/encryption_internal_nossl.cc b/cpp/src/parquet/encryption/encryption_internal_nossl.cc index bb203f0fd877d..0241923474de9 100644 --- a/cpp/src/parquet/encryption/encryption_internal_nossl.cc +++ b/cpp/src/parquet/encryption/encryption_internal_nossl.cc @@ -18,8 +18,7 @@ #include "parquet/encryption/encryption_internal.h" #include "parquet/exception.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { void ThrowOpenSSLRequiredException() { throw ParquetException( @@ -115,5 +114,4 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD) { void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_key_material_store.h b/cpp/src/parquet/encryption/file_key_material_store.h index 862e8d9761b0d..83f028a4bc1e9 100644 --- a/cpp/src/parquet/encryption/file_key_material_store.h +++ b/cpp/src/parquet/encryption/file_key_material_store.h @@ -24,8 +24,7 @@ #include "arrow/filesystem/filesystem.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { /// Stores encryption key material outside the Parquet file, for example in a separate /// small file in the same folder. 
This is important for “key rotation”, when MEKs have to @@ -55,5 +54,4 @@ class PARQUET_EXPORT FileKeyMaterialStore { virtual ~FileKeyMaterialStore() {} }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_key_unwrapper.cc b/cpp/src/parquet/encryption/file_key_unwrapper.cc index a2e2d2df5b284..50cc6eee539f7 100644 --- a/cpp/src/parquet/encryption/file_key_unwrapper.cc +++ b/cpp/src/parquet/encryption/file_key_unwrapper.cc @@ -22,8 +22,7 @@ #include "parquet/encryption/file_key_unwrapper.h" #include "parquet/encryption/key_metadata.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { FileKeyUnwrapper::FileKeyUnwrapper( KeyToolkit* key_toolkit, const KmsConnectionConfig& kms_connection_config, @@ -136,5 +135,4 @@ std::shared_ptr FileKeyUnwrapper::GetKmsClientFromConfigOrKeyMaterial cache_entry_lifetime_seconds_); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_key_unwrapper.h b/cpp/src/parquet/encryption/file_key_unwrapper.h index 3400641ed91c5..71b245788a713 100644 --- a/cpp/src/parquet/encryption/file_key_unwrapper.h +++ b/cpp/src/parquet/encryption/file_key_unwrapper.h @@ -27,8 +27,7 @@ #include "parquet/encryption/kms_client.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { // This class will retrieve the key from "key metadata", following these steps: // 1. Parse "key metadata" (see structure in KeyMetadata class). @@ -78,5 +77,4 @@ class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { std::shared_ptr<::arrow::fs::FileSystem> file_system_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_key_wrapper.cc b/cpp/src/parquet/encryption/file_key_wrapper.cc index 4f0f1d219acba..704651ebaa8b3 100644 --- a/cpp/src/parquet/encryption/file_key_wrapper.cc +++ b/cpp/src/parquet/encryption/file_key_wrapper.cc @@ -22,8 +22,7 @@ #include "parquet/encryption/key_toolkit_internal.h" #include "parquet/exception.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { FileKeyWrapper::FileKeyWrapper(KeyToolkit* key_toolkit, const KmsConnectionConfig& kms_connection_config, @@ -124,5 +123,4 @@ KeyEncryptionKey FileKeyWrapper::CreateKeyEncryptionKey( return KeyEncryptionKey(kek_bytes, kek_id, encoded_wrapped_kek); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_key_wrapper.h b/cpp/src/parquet/encryption/file_key_wrapper.h index 95ad6ec4829bf..26b9719de64db 100644 --- a/cpp/src/parquet/encryption/file_key_wrapper.h +++ b/cpp/src/parquet/encryption/file_key_wrapper.h @@ -29,8 +29,7 @@ #include "parquet/encryption/kms_client.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { // This class will generate "key metadata" from "data encryption key" and "master key", // following these steps: @@ -82,5 +81,4 @@ class PARQUET_EXPORT FileKeyWrapper { uint16_t key_counter_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_system_key_material_store.cc b/cpp/src/parquet/encryption/file_system_key_material_store.cc index 494a75e4cc307..2d898c1d3970f 100644 --- 
a/cpp/src/parquet/encryption/file_system_key_material_store.cc +++ b/cpp/src/parquet/encryption/file_system_key_material_store.cc @@ -27,8 +27,7 @@ #include "parquet/encryption/key_material.h" #include "parquet/exception.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr const char FileSystemKeyMaterialStore::kKeyMaterialFilePrefix[]; constexpr const char FileSystemKeyMaterialStore::kTempFilePrefix[]; @@ -139,5 +138,4 @@ void FileSystemKeyMaterialStore::MoveMaterialTo( } } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/file_system_key_material_store.h b/cpp/src/parquet/encryption/file_system_key_material_store.h index 6fbdd55e9413c..896a53202f589 100644 --- a/cpp/src/parquet/encryption/file_system_key_material_store.h +++ b/cpp/src/parquet/encryption/file_system_key_material_store.h @@ -25,8 +25,7 @@ #include "parquet/encryption/file_key_material_store.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { /// A FileKeyMaterialStore that stores key material in a file system file in the same /// folder as the Parquet file. @@ -87,5 +86,4 @@ class PARQUET_EXPORT FileSystemKeyMaterialStore : public FileKeyMaterialStore { std::unordered_map key_material_map_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_encryption_key.h b/cpp/src/parquet/encryption/key_encryption_key.h index 153bb4b5e2885..62263ee3cd506 100644 --- a/cpp/src/parquet/encryption/key_encryption_key.h +++ b/cpp/src/parquet/encryption/key_encryption_key.h @@ -22,8 +22,7 @@ #include "arrow/util/base64.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { // In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a “key // encryption key” (KEK), that in turn is encrypted with a "master encryption key" (MEK). 
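For orientation, the double-wrapping scheme described in the comment above can be sketched with the local-wrap helpers declared elsewhere in this patch (key_toolkit_internal.h). This is a minimal sketch, not code from the patch: the key bytes and AAD strings are hypothetical placeholders, and the EncryptKeyLocally/DecryptKeyLocally signatures are assumed from that header.

// A minimal sketch of double wrapping, assuming the EncryptKeyLocally /
// DecryptKeyLocally helpers declared in key_toolkit_internal.h.
#include <string>
#include "parquet/encryption/key_toolkit_internal.h"

namespace enc = parquet::encryption::internal;

void DoubleWrapSketch() {
  std::string mek(16, 'm');  // hypothetical 128-bit master encryption key
  std::string kek(16, 'k');  // key encryption key
  std::string dek(16, 'd');  // data encryption key
  // Wrap: the DEK is encrypted with the KEK, and the KEK with the MEK.
  std::string wrapped_dek = enc::EncryptKeyLocally(dek, kek, /*aad=*/"kek-id");
  std::string wrapped_kek = enc::EncryptKeyLocally(kek, mek, /*aad=*/"mek-id");
  // Unwrap runs the chain in reverse: recover the KEK first, then the DEK.
  std::string kek2 = enc::DecryptKeyLocally(wrapped_kek, mek, "mek-id");
  std::string dek2 = enc::DecryptKeyLocally(wrapped_dek, kek2, "kek-id");
}

Because only the small wrapped-KEK entries reference the MEK directly, rotating a master key does not require re-encrypting the data itself.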
@@ -55,5 +54,4 @@ class KeyEncryptionKey { std::string encoded_wrapped_kek_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_management_test.cc b/cpp/src/parquet/encryption/key_management_test.cc index 5eebde0c29584..f733c43ee1e79 100644 --- a/cpp/src/parquet/encryption/key_management_test.cc +++ b/cpp/src/parquet/encryption/key_management_test.cc @@ -37,9 +37,7 @@ #include "parquet/file_reader.h" #include "parquet/test_util.h" -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { class TestEncryptionKeyManagement : public ::testing::Test { protected: @@ -387,6 +385,4 @@ TEST_F(TestEncryptionKeyManagement, ReadParquetMRExternalKeyMaterialFile) { } } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/key_material.cc b/cpp/src/parquet/encryption/key_material.cc index 372279c33a5bd..1cebf5900f316 100644 --- a/cpp/src/parquet/encryption/key_material.cc +++ b/cpp/src/parquet/encryption/key_material.cc @@ -25,8 +25,7 @@ using ::arrow::json::internal::ObjectParser; using ::arrow::json::internal::ObjectWriter; -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr const char KeyMaterial::kKeyMaterialTypeField[]; constexpr const char KeyMaterial::kKeyMaterialType1[]; @@ -155,5 +154,4 @@ std::string KeyMaterial::SerializeToJson( return json_writer.Serialize(); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_material.h b/cpp/src/parquet/encryption/key_material.h index f20d23ea35d3b..3e7e862c996d3 100644 --- a/cpp/src/parquet/encryption/key_material.h +++ b/cpp/src/parquet/encryption/key_material.h @@ -29,8 +29,7 @@ class ObjectParser; } // namespace json } // namespace arrow -namespace parquet { -namespace encryption { +namespace parquet::encryption { // KeyMaterial class represents the "key material", keeping the information that allows // readers to recover an encryption key (see description of the KeyMetadata class). 
The @@ -127,5 +126,4 @@ class PARQUET_EXPORT KeyMaterial { std::string encoded_wrapped_dek_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_metadata.cc b/cpp/src/parquet/encryption/key_metadata.cc index 624626c890cc5..e23a67b6b86ee 100644 --- a/cpp/src/parquet/encryption/key_metadata.cc +++ b/cpp/src/parquet/encryption/key_metadata.cc @@ -24,8 +24,7 @@ using ::arrow::json::internal::ObjectParser; using ::arrow::json::internal::ObjectWriter; -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr const char KeyMetadata::kKeyMaterialInternalStorageField[]; constexpr const char KeyMetadata::kKeyReferenceField[]; @@ -85,5 +84,4 @@ std::string KeyMetadata::CreateSerializedForExternalMaterial( return json_writer.Serialize(); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_metadata.h b/cpp/src/parquet/encryption/key_metadata.h index b6dc349f19bdf..6fe8ac7ccb9db 100644 --- a/cpp/src/parquet/encryption/key_metadata.h +++ b/cpp/src/parquet/encryption/key_metadata.h @@ -24,8 +24,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { // Parquet encryption specification defines "key metadata" as an arbitrary byte array, // generated by file writers for each encryption key, and passed to the low level API for @@ -89,5 +88,4 @@ class PARQUET_EXPORT KeyMetadata { ::std::variant key_material_or_reference_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_metadata_test.cc b/cpp/src/parquet/encryption/key_metadata_test.cc index 3f891ef26db83..f9409edf2a8d9 100644 --- a/cpp/src/parquet/encryption/key_metadata_test.cc +++ b/cpp/src/parquet/encryption/key_metadata_test.cc @@ -22,9 +22,7 @@ #include "parquet/encryption/key_material.h" #include "parquet/encryption/key_metadata.h" -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { TEST(KeyMetadataTest, InternalMaterialStorage) { bool is_footer_key = true; @@ -72,6 +70,4 @@ TEST(KeyMetadataTest, ExternalMaterialStorage) { ASSERT_EQ(key_metadata.key_reference(), key_reference); } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/key_toolkit.cc b/cpp/src/parquet/encryption/key_toolkit.cc index 0b8543b458289..cb488d3fa23a0 100644 --- a/cpp/src/parquet/encryption/key_toolkit.cc +++ b/cpp/src/parquet/encryption/key_toolkit.cc @@ -27,8 +27,7 @@ #include "parquet/encryption/file_system_key_material_store.h" #include "parquet/encryption/key_toolkit_internal.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { std::shared_ptr KeyToolkit::GetKmsClient( const KmsConnectionConfig& kms_connection_config, double cache_entry_lifetime_ms) { @@ -119,5 +118,4 @@ void KeyToolkit::RemoveCacheEntriesForAllTokens() { kek_read_cache_per_token().Clear(); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_toolkit.h b/cpp/src/parquet/encryption/key_toolkit.h index d65f5d8a2d0f9..f63ade4c8c93f 100644 --- a/cpp/src/parquet/encryption/key_toolkit.h +++ b/cpp/src/parquet/encryption/key_toolkit.h @@ -26,8 +26,7 @@ #include 
"parquet/encryption/two_level_cache_with_expiration.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { static constexpr uint64_t kCacheCleanPeriodForKeyRotation = 60 * 60; // 1 hour @@ -104,5 +103,4 @@ class PARQUET_EXPORT KeyWithMasterId { const std::string master_id_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/key_toolkit_internal.cc b/cpp/src/parquet/encryption/key_toolkit_internal.cc index 6e0e4e6c65e1e..bdd65d8de3919 100644 --- a/cpp/src/parquet/encryption/key_toolkit_internal.cc +++ b/cpp/src/parquet/encryption/key_toolkit_internal.cc @@ -20,9 +20,7 @@ #include "parquet/encryption/encryption_internal.h" #include "parquet/encryption/key_toolkit_internal.h" -namespace parquet { -namespace encryption { -namespace internal { +namespace parquet::encryption::internal { // Acceptable key lengths in number of bits, used to validate the data key lengths // configured by users and the master key lengths fetched from KMS server. @@ -77,6 +75,4 @@ bool ValidateKeyLength(int32_t key_length_bits) { return found_key_length != std::end(kAcceptableDataKeyLengths); } -} // namespace internal -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::internal diff --git a/cpp/src/parquet/encryption/key_toolkit_internal.h b/cpp/src/parquet/encryption/key_toolkit_internal.h index bcc60bdad68e2..8474a91fc1aba 100644 --- a/cpp/src/parquet/encryption/key_toolkit_internal.h +++ b/cpp/src/parquet/encryption/key_toolkit_internal.h @@ -21,9 +21,7 @@ #include "parquet/platform.h" -namespace parquet { -namespace encryption { -namespace internal { +namespace parquet::encryption::internal { /// Encrypts "key" with "master_key", using AES-GCM and the "aad" PARQUET_EXPORT @@ -38,6 +36,4 @@ std::string DecryptKeyLocally(const std::string& encoded_encrypted_key, PARQUET_EXPORT bool ValidateKeyLength(int32_t key_length_bits); -} // namespace internal -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::internal diff --git a/cpp/src/parquet/encryption/key_wrapping_test.cc b/cpp/src/parquet/encryption/key_wrapping_test.cc index dba9d67dfe13d..198ceb9bf4b11 100644 --- a/cpp/src/parquet/encryption/key_wrapping_test.cc +++ b/cpp/src/parquet/encryption/key_wrapping_test.cc @@ -26,9 +26,7 @@ #include "parquet/encryption/test_encryption_util.h" #include "parquet/encryption/test_in_memory_kms.h" -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { class KeyWrappingTest : public ::testing::Test { protected: @@ -113,6 +111,4 @@ TEST_F(KeyWrappingTest, ExternalMaterialStorage) { this->WrapThenUnwrap(false, false, false); } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/kms_client.cc b/cpp/src/parquet/encryption/kms_client.cc index b9c720272c479..fee03dd3db656 100644 --- a/cpp/src/parquet/encryption/kms_client.cc +++ b/cpp/src/parquet/encryption/kms_client.cc @@ -17,8 +17,7 @@ #include "parquet/encryption/kms_client.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr const char KmsClient::kKmsInstanceIdDefault[]; constexpr const char KmsClient::kKmsInstanceUrlDefault[]; @@ -40,5 +39,4 @@ void KmsConnectionConfig::SetDefaultIfEmpty() { } } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git 
a/cpp/src/parquet/encryption/kms_client.h b/cpp/src/parquet/encryption/kms_client.h index 5ffa604ffd198..a55fd552eed5f 100644 --- a/cpp/src/parquet/encryption/kms_client.h +++ b/cpp/src/parquet/encryption/kms_client.h @@ -26,8 +26,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { /// This class wraps the key access token of a KMS server. If your token changes over /// time, you should keep the reference to the KeyAccessToken object and call Refresh() @@ -91,5 +90,4 @@ class PARQUET_EXPORT KmsClient { virtual ~KmsClient() {} }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/kms_client_factory.h b/cpp/src/parquet/encryption/kms_client_factory.h index eac8dfc5d06e2..7a7c77c7eebbf 100644 --- a/cpp/src/parquet/encryption/kms_client_factory.h +++ b/cpp/src/parquet/encryption/kms_client_factory.h @@ -20,8 +20,7 @@ #include "parquet/encryption/kms_client.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { class PARQUET_EXPORT KmsClientFactory { public: @@ -36,5 +35,4 @@ class PARQUET_EXPORT KmsClientFactory { bool wrap_locally_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/local_wrap_kms_client.cc b/cpp/src/parquet/encryption/local_wrap_kms_client.cc index 1b89dc57d0e52..23e28bb8e61be 100644 --- a/cpp/src/parquet/encryption/local_wrap_kms_client.cc +++ b/cpp/src/parquet/encryption/local_wrap_kms_client.cc @@ -25,8 +25,7 @@ using ::arrow::json::internal::ObjectParser; using ::arrow::json::internal::ObjectWriter; -namespace parquet { -namespace encryption { +namespace parquet::encryption { constexpr const char LocalWrapKmsClient::kLocalWrapNoKeyVersion[]; @@ -112,5 +111,4 @@ std::string LocalWrapKmsClient::GetKeyFromServer(const std::string& key_identifi return master_key; } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/local_wrap_kms_client.h b/cpp/src/parquet/encryption/local_wrap_kms_client.h index 65cf8f42c7964..3c90d82960525 100644 --- a/cpp/src/parquet/encryption/local_wrap_kms_client.h +++ b/cpp/src/parquet/encryption/local_wrap_kms_client.h @@ -25,8 +25,7 @@ #include "parquet/encryption/kms_client.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { /// This class supports local wrapping mode, master keys will be fetched from the KMS /// server and used to encrypt other keys (data encryption keys or key encryption keys). 
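As a usage sketch for the KmsClient surface touched above: a toy client might look like the following. The WrapKey/UnwrapKey override signatures are assumed from kms_client.h, and the base64 round-trip is a stand-in for a real KMS call, not how any shipped client behaves.

// Hypothetical KmsClient sketch; signatures assumed from kms_client.h.
#include <string>
#include "arrow/util/base64.h"
#include "parquet/encryption/kms_client.h"

class ToyKmsClient : public parquet::encryption::KmsClient {
 public:
  // "Wraps" by base64-encoding in place of a real KMS encrypt call.
  std::string WrapKey(const std::string& key_bytes,
                      const std::string& master_key_identifier) override {
    return ::arrow::util::base64_encode(key_bytes);
  }
  // Reverses the toy wrapping; a real client would ask the KMS server.
  std::string UnwrapKey(const std::string& wrapped_key,
                        const std::string& master_key_identifier) override {
    return ::arrow::util::base64_decode(wrapped_key);
  }
};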
@@ -92,5 +91,4 @@ class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient { ::arrow::util::ConcurrentMap master_key_cache_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/properties_test.cc b/cpp/src/parquet/encryption/properties_test.cc index 0eb5cba201a24..895cf6c63431e 100644 --- a/cpp/src/parquet/encryption/properties_test.cc +++ b/cpp/src/parquet/encryption/properties_test.cc @@ -22,9 +22,7 @@ #include "parquet/encryption/encryption.h" #include "parquet/encryption/test_encryption_util.h" -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { TEST(TestColumnEncryptionProperties, ColumnEncryptedWithOwnKey) { std::string column_path_1 = "column_1"; @@ -271,6 +269,4 @@ TEST(TestDecryptionProperties, UsingExplicitFooterAndColumnKeys) { ASSERT_EQ(kColumnEncryptionKey2, props->column_key(column_path_2)); } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/read_configurations_test.cc b/cpp/src/parquet/encryption/read_configurations_test.cc index 0bdb67ee9eadc..10de7198ac5ff 100644 --- a/cpp/src/parquet/encryption/read_configurations_test.cc +++ b/cpp/src/parquet/encryption/read_configurations_test.cc @@ -81,9 +81,7 @@ */ -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { using parquet::test::ParquetTestException; @@ -272,6 +270,4 @@ INSTANTIATE_TEST_SUITE_P( 5, "encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"), std::make_tuple(6, "encrypt_columns_and_footer_ctr.parquet.encrypted"))); -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/test_encryption_util.cc b/cpp/src/parquet/encryption/test_encryption_util.cc index 26ed15ae031e1..694ed3cf42d9e 100644 --- a/cpp/src/parquet/encryption/test_encryption_util.cc +++ b/cpp/src/parquet/encryption/test_encryption_util.cc @@ -37,9 +37,7 @@ using parquet::Type; using parquet::schema::GroupNode; using parquet::schema::PrimitiveNode; -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { std::string data_file(const char* file) { std::string dir_string(parquet::test::get_data_dir()); @@ -511,6 +509,4 @@ void FileDecryptor::CheckFile(parquet::ParquetFileReader* file_reader, } } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/test_encryption_util.h b/cpp/src/parquet/encryption/test_encryption_util.h index c2190709aff96..19c230ee5ff99 100644 --- a/cpp/src/parquet/encryption/test_encryption_util.h +++ b/cpp/src/parquet/encryption/test_encryption_util.h @@ -37,8 +37,7 @@ namespace parquet { class ParquetFileReader; -namespace encryption { -namespace test { +namespace encryption::test { using ::arrow::internal::TemporaryDir; @@ -122,6 +121,5 @@ class FileDecryptor { FileDecryptionProperties* file_decryption_properties); }; -} // namespace test -} // namespace encryption +} // namespace encryption::test } // namespace parquet diff --git a/cpp/src/parquet/encryption/test_in_memory_kms.cc b/cpp/src/parquet/encryption/test_in_memory_kms.cc index 5389196b6fa39..e1339ab48b5d6 100644 --- a/cpp/src/parquet/encryption/test_in_memory_kms.cc +++ b/cpp/src/parquet/encryption/test_in_memory_kms.cc @@ -21,8 +21,7 @@ #include 
"parquet/encryption/test_in_memory_kms.h" #include "parquet/exception.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { std::unordered_map TestOnlyLocalWrapInMemoryKms::master_key_map_; @@ -95,5 +94,4 @@ std::string TestOnlyInServerWrapKms::GetMasterKeyFromServer( return wrapping_master_key_map_.at(master_key_identifier); } -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/test_in_memory_kms.h b/cpp/src/parquet/encryption/test_in_memory_kms.h index bf887191d1efc..c5fdc797b8ca7 100644 --- a/cpp/src/parquet/encryption/test_in_memory_kms.h +++ b/cpp/src/parquet/encryption/test_in_memory_kms.h @@ -25,8 +25,7 @@ #include "parquet/encryption/local_wrap_kms_client.h" #include "parquet/platform.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { // This is a mock class, built for testing only. Don't use it as an example of // LocalWrapKmsClient implementation. @@ -92,5 +91,4 @@ class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory { } }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/two_level_cache_with_expiration.h b/cpp/src/parquet/encryption/two_level_cache_with_expiration.h index fbd06dc7d20e0..76c2b82770000 100644 --- a/cpp/src/parquet/encryption/two_level_cache_with_expiration.h +++ b/cpp/src/parquet/encryption/two_level_cache_with_expiration.h @@ -23,8 +23,7 @@ #include "arrow/util/concurrent_map.h" #include "arrow/util/mutex.h" -namespace parquet { -namespace encryption { +namespace parquet::encryption { using ::arrow::util::ConcurrentMap; @@ -155,5 +154,4 @@ class TwoLevelCacheWithExpiration { ::arrow::util::Mutex mutex_; }; -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption diff --git a/cpp/src/parquet/encryption/two_level_cache_with_expiration_test.cc b/cpp/src/parquet/encryption/two_level_cache_with_expiration_test.cc index f375a5c5b315c..d8f2c6255145f 100644 --- a/cpp/src/parquet/encryption/two_level_cache_with_expiration_test.cc +++ b/cpp/src/parquet/encryption/two_level_cache_with_expiration_test.cc @@ -25,9 +25,7 @@ #include "parquet/encryption/two_level_cache_with_expiration.h" -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { using ::arrow::SleepFor; @@ -172,6 +170,4 @@ TEST_F(TwoLevelCacheWithExpirationTest, MultiThread) { clean_thread.join(); } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/encryption/write_configurations_test.cc b/cpp/src/parquet/encryption/write_configurations_test.cc index 580c95fdfd2aa..e262003db3e6a 100644 --- a/cpp/src/parquet/encryption/write_configurations_test.cc +++ b/cpp/src/parquet/encryption/write_configurations_test.cc @@ -60,9 +60,7 @@ * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 
*/ -namespace parquet { -namespace encryption { -namespace test { +namespace parquet::encryption::test { using FileClass = ::arrow::io::FileOutputStream; @@ -231,6 +229,4 @@ void TestEncryptionConfiguration::SetUpTestCase() { temp_dir = temp_data_dir().ValueOrDie(); } -} // namespace test -} // namespace encryption -} // namespace parquet +} // namespace parquet::encryption::test diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index fc30ddb43f29c..08d493b0bca2f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -29,6 +29,7 @@ #include "arrow/io/caching.h" #include "arrow/io/file.h" #include "arrow/io/memory.h" +#include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/int_util_overflow.h" @@ -178,17 +179,17 @@ class SerializedRowGroup : public RowGroupReader::Contents { SerializedRowGroup(std::shared_ptr<ArrowInputFile> source, std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source, int64_t source_size, FileMetaData* file_metadata, - int row_group_number, const ReaderProperties& props, - std::unordered_set<int> prebuffered_column_chunks, + int row_group_number, ReaderProperties props, + std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap, std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr) : source_(std::move(source)), cached_source_(std::move(cached_source)), source_size_(source_size), file_metadata_(file_metadata), - properties_(props), + properties_(std::move(props)), row_group_ordinal_(row_group_number), - prebuffered_column_chunks_(std::move(prebuffered_column_chunks)), - file_decryptor_(file_decryptor) { + prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)), + file_decryptor_(std::move(file_decryptor)) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -203,8 +204,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { ::arrow::io::ReadRange col_range = ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i); std::shared_ptr<ArrowInputStream> stream; - if (cached_source_ && - prebuffered_column_chunks_.find(i) != prebuffered_column_chunks_.end()) { + if (cached_source_ && prebuffered_column_chunks_bitmap_ != nullptr && + ::arrow::bit_util::GetBit(prebuffered_column_chunks_bitmap_->data(), i)) { // PARQUET-1698: if read coalescing is enabled, read from pre-buffered // segments. PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range)); @@ -272,7 +273,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr<RowGroupMetaData> row_group_metadata_; ReaderProperties properties_; int row_group_ordinal_; - const std::unordered_set<int> prebuffered_column_chunks_; + const std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap_; std::shared_ptr<InternalFileDecryptor> file_decryptor_; }; @@ -302,17 +303,17 @@ class SerializedFile : public ParquetFileReader::Contents { } std::shared_ptr<RowGroupReader> GetRowGroup(int i) override { - std::unordered_set<int> prebuffered_column_chunks; - // Avoid updating the map as this function can be called concurrently. The map can - // only be updated within Prebuffer(). + std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap; + // Avoid updating the bitmap as this function can be called concurrently. The bitmap + // can only be updated within Prebuffer().
auto prebuffered_column_chunks_iter = prebuffered_column_chunks_.find(i); if (prebuffered_column_chunks_iter != prebuffered_column_chunks_.end()) { - prebuffered_column_chunks = prebuffered_column_chunks_iter->second; + prebuffered_column_chunks_bitmap = prebuffered_column_chunks_iter->second; } std::unique_ptr<SerializedRowGroup> contents = std::make_unique<SerializedRowGroup>( source_, cached_source_, source_size_, file_metadata_.get(), i, properties_, - std::move(prebuffered_column_chunks), file_decryptor_); + std::move(prebuffered_column_chunks_bitmap), file_decryptor_); return std::make_shared<RowGroupReader>(std::move(contents)); } @@ -365,10 +366,19 @@ class SerializedFile : public ParquetFileReader::Contents { std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options); std::vector<::arrow::io::ReadRange> ranges; prebuffered_column_chunks_.clear(); + int num_cols = file_metadata_->num_columns(); + // a bitmap for buffered columns. + std::shared_ptr<Buffer> buffer_columns; + if (!row_groups.empty()) { + PARQUET_THROW_NOT_OK(AllocateEmptyBitmap(num_cols, properties_.memory_pool()) + .Value(&buffer_columns)); + for (int col : column_indices) { + ::arrow::bit_util::SetBit(buffer_columns->mutable_data(), col); + } + } for (int row : row_groups) { - std::unordered_set<int>& prebuffered = prebuffered_column_chunks_[row]; + prebuffered_column_chunks_[row] = buffer_columns; for (int col : column_indices) { - prebuffered.insert(col); ranges.push_back( ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); } @@ -578,8 +588,9 @@ class SerializedFile : public ParquetFileReader::Contents { ReaderProperties properties_; std::shared_ptr<PageIndexReader> page_index_reader_; std::unique_ptr<BloomFilterReader> bloom_filter_reader_; - // Maps a row group to its column chunks that are cached via Prebuffer(). - std::unordered_map<int, std::unordered_set<int>> prebuffered_column_chunks_; + // Maps row group ordinal and prebuffer status of its column chunks in the form of a + // bitmap buffer. + std::unordered_map<int, std::shared_ptr<Buffer>> prebuffered_column_chunks_; std::shared_ptr<InternalFileDecryptor> file_decryptor_; // \return The true length of the metadata in bytes diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 42ce7591cb7a6..2a6a88df2dd0a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -153,11 +153,24 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path) ? page_index_builder_->GetOffsetIndexBuilder(column_ordinal) : nullptr; - std::unique_ptr<PageWriter> pager = PageWriter::Open( - sink_, properties_->compression(path), properties_->compression_level(path), - col_meta, row_group_ordinal_, static_cast<int16_t>(column_ordinal), - properties_->memory_pool(), false, meta_encryptor, data_encryptor, - properties_->page_checksum_enabled(), ci_builder, oi_builder); + auto codec_options = properties_->codec_options(path) + ? 
properties_->codec_options(path).get() + : nullptr; + + std::unique_ptr pager; + if (!codec_options) { + pager = PageWriter::Open(sink_, properties_->compression(path), col_meta, + row_group_ordinal_, static_cast(column_ordinal), + properties_->memory_pool(), false, meta_encryptor, + data_encryptor, properties_->page_checksum_enabled(), + ci_builder, oi_builder, CodecOptions()); + } else { + pager = PageWriter::Open(sink_, properties_->compression(path), col_meta, + row_group_ordinal_, static_cast(column_ordinal), + properties_->memory_pool(), false, meta_encryptor, + data_encryptor, properties_->page_checksum_enabled(), + ci_builder, oi_builder, *codec_options); + } column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -291,12 +304,24 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path) ? page_index_builder_->GetOffsetIndexBuilder(column_ordinal) : nullptr; - std::unique_ptr pager = PageWriter::Open( - sink_, properties_->compression(path), properties_->compression_level(path), - col_meta, static_cast(row_group_ordinal_), - static_cast(column_ordinal), properties_->memory_pool(), - buffered_row_group_, meta_encryptor, data_encryptor, - properties_->page_checksum_enabled(), ci_builder, oi_builder); + auto codec_options = properties_->codec_options(path) + ? (properties_->codec_options(path)).get() + : nullptr; + + std::unique_ptr pager; + if (!codec_options) { + pager = PageWriter::Open( + sink_, properties_->compression(path), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, CodecOptions()); + } else { + pager = PageWriter::Open( + sink_, properties_->compression(path), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); + } column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } diff --git a/cpp/src/parquet/level_comparison.cc b/cpp/src/parquet/level_comparison.cc index c9ad6b76c7280..f3188e987d081 100644 --- a/cpp/src/parquet/level_comparison.cc +++ b/cpp/src/parquet/level_comparison.cc @@ -25,8 +25,7 @@ #include "arrow/util/dispatch.h" -namespace parquet { -namespace internal { +namespace parquet::internal { #if defined(ARROW_HAVE_RUNTIME_AVX2) MinMax FindMinMaxAvx2(const int16_t* levels, int64_t num_levels); @@ -78,5 +77,4 @@ MinMax FindMinMax(const int16_t* levels, int64_t num_levels) { return dispatch.func(levels, num_levels); } -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/level_comparison.h b/cpp/src/parquet/level_comparison.h index 38e7ef8e2ec3f..3ae442dd46e57 100644 --- a/cpp/src/parquet/level_comparison.h +++ b/cpp/src/parquet/level_comparison.h @@ -21,8 +21,7 @@ #include "parquet/platform.h" -namespace parquet { -namespace internal { +namespace parquet::internal { /// Builds a bitmap where each set bit indicates the corresponding level is greater /// than rhs. 
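The GreaterThanBitmap helper documented just above is easiest to see with a concrete call. This is an illustrative sketch only; the GreaterThanBitmap and FindMinMax signatures are assumed from level_comparison.h, and the level values are made up.

// Illustrative use of the level-comparison helpers: definition levels
// greater than 0 mark slots that are non-null at the top nesting level.
#include <cstdint>
#include "parquet/level_comparison.h"

void LevelBitmapSketch() {
  const int16_t def_levels[] = {0, 1, 1, 0, 1};
  // Bit i is set iff def_levels[i] > 0, so this yields 0b10110 (22).
  uint64_t non_null = parquet::internal::GreaterThanBitmap(def_levels, 5, 0);
  // min == 0 and max == 1 here; callers can short-circuit runs whose
  // levels are all equal (all-null or all-present pages).
  parquet::internal::MinMax mm = parquet::internal::FindMinMax(def_levels, 5);
  (void)non_null;
  (void)mm;
}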
@@ -36,5 +35,4 @@ struct MinMax { MinMax FindMinMax(const int16_t* levels, int64_t num_levels); -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/level_comparison_inc.h b/cpp/src/parquet/level_comparison_inc.h index 055f81ffae898..cfee506654331 100644 --- a/cpp/src/parquet/level_comparison_inc.h +++ b/cpp/src/parquet/level_comparison_inc.h @@ -24,9 +24,7 @@ #ifndef PARQUET_IMPL_NAMESPACE #error "PARQUET_IMPL_NAMESPACE must be defined" #endif -namespace parquet { -namespace internal { -namespace PARQUET_IMPL_NAMESPACE { +namespace parquet::internal::PARQUET_IMPL_NAMESPACE { /// Builds a bitmap by applying predicate to the level vector provided. /// /// \param[in] levels Rep or def level array. @@ -60,6 +58,4 @@ inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels, return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; }); } -} // namespace PARQUET_IMPL_NAMESPACE -} // namespace internal -} // namespace parquet +} // namespace parquet::internal::PARQUET_IMPL_NAMESPACE diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index 2e5bcacea55d6..1271afd866d14 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -31,8 +31,7 @@ #include "parquet/level_conversion_inc.h" #undef PARQUET_IMPL_NAMESPACE -namespace parquet { -namespace internal { +namespace parquet::internal { namespace { using ::arrow::internal::CpuInfo; @@ -179,5 +178,4 @@ void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels, output, /*offsets=*/nullptr); } -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 480d82ed0d81a..3f56b2de36a78 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -23,8 +23,7 @@ #include "parquet/platform.h" #include "parquet/schema.h" -namespace parquet { -namespace internal { +namespace parquet::internal { struct PARQUET_EXPORT LevelInfo { LevelInfo() @@ -196,5 +195,4 @@ void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels, // (i.e. it isn't hidden by runtime dispatch). uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection); -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/level_conversion_benchmark.cc b/cpp/src/parquet/level_conversion_benchmark.cc index f9e91c4820f68..f3a4f8095e3a1 100644 --- a/cpp/src/parquet/level_conversion_benchmark.cc +++ b/cpp/src/parquet/level_conversion_benchmark.cc @@ -29,7 +29,7 @@ constexpr int16_t kMissingDefLevel = 0; // Definition Level indicating the values has an entry in the leaf element. constexpr int16_t kPresentDefLevel = 2; -// A repition level that indicates a repeated element. +// A repetition level that indicates a repeated element. 
constexpr int16_t kHasRepeatedElements = 1; std::vector RunDefinitionLevelsToBitmap(const std::vector& def_levels, diff --git a/cpp/src/parquet/level_conversion_bmi2.cc b/cpp/src/parquet/level_conversion_bmi2.cc index 274d54e503c81..a39d1fd1eb461 100644 --- a/cpp/src/parquet/level_conversion_bmi2.cc +++ b/cpp/src/parquet/level_conversion_bmi2.cc @@ -20,8 +20,7 @@ #include "parquet/level_conversion_inc.h" #undef PARQUET_IMPL_NAMESPACE -namespace parquet { -namespace internal { +namespace parquet::internal { void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels, int64_t num_def_levels, LevelInfo level_info, ValidityBitmapInputOutput* output) { @@ -29,5 +28,4 @@ void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels, level_info, output); } -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/level_conversion_inc.h b/cpp/src/parquet/level_conversion_inc.h index 710d2f6237913..0bcdbccb34a73 100644 --- a/cpp/src/parquet/level_conversion_inc.h +++ b/cpp/src/parquet/level_conversion_inc.h @@ -29,13 +29,10 @@ #include "arrow/util/simd.h" #include "parquet/exception.h" #include "parquet/level_comparison.h" - -namespace parquet { -namespace internal { #ifndef PARQUET_IMPL_NAMESPACE #error "PARQUET_IMPL_NAMESPACE must be defined" #endif -namespace PARQUET_IMPL_NAMESPACE { +namespace parquet::internal::PARQUET_IMPL_NAMESPACE { // clang-format off /* Python code to generate lookup table: @@ -352,6 +349,4 @@ void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, writer.Finish(); } -} // namespace PARQUET_IMPL_NAMESPACE -} // namespace internal -} // namespace parquet +} // namespace parquet::internal::PARQUET_IMPL_NAMESPACE diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index bfce74ae3a868..b12680089b839 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -31,8 +31,7 @@ #include "arrow/util/bitmap.h" #include "arrow/util/ubsan.h" -namespace parquet { -namespace internal { +namespace parquet::internal { using ::arrow::internal::Bitmap; using ::testing::ElementsAreArray; @@ -357,5 +356,4 @@ TEST(TestOnlyExtractBitsSoftware, BasicTest) { check(0xFECBDA9876543210ULL, 0xF00FF00FF00FF00FULL, 0xFBD87430ULL); } -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 541bcc18b8bc3..8aedf5b926add 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -593,13 +593,13 @@ class FileMetaData::FileMetaDataImpl { FileMetaDataImpl() = default; explicit FileMetaDataImpl( - const void* metadata, uint32_t* metadata_len, const ReaderProperties& properties, + const void* metadata, uint32_t* metadata_len, ReaderProperties properties, std::shared_ptr file_decryptor = nullptr) - : properties_(properties), file_decryptor_(file_decryptor) { - metadata_.reset(new format::FileMetaData); + : properties_(std::move(properties)), file_decryptor_(std::move(file_decryptor)) { + metadata_ = std::make_unique(); auto footer_decryptor = - file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr; + file_decryptor_ != nullptr ? 
file_decryptor_->GetFooterDecryptor() : nullptr; ThriftDeserializer deserializer(properties_); deserializer.DeserializeMessage(reinterpret_cast(metadata), @@ -779,8 +779,8 @@ class FileMetaData::FileMetaDataImpl { } std::shared_ptr out(new FileMetaData()); - out->impl_.reset(new FileMetaDataImpl()); - out->impl_->metadata_.reset(new format::FileMetaData()); + out->impl_ = std::make_unique(); + out->impl_->metadata_ = std::make_unique(); auto metadata = out->impl_->metadata_.get(); metadata->version = metadata_->version; @@ -834,6 +834,7 @@ class FileMetaData::FileMetaDataImpl { // update ColumnOrder std::vector column_orders; if (metadata_->__isset.column_orders) { + column_orders.reserve(metadata_->column_orders.size()); for (auto column_order : metadata_->column_orders) { if (column_order.__isset.TYPE_ORDER) { column_orders.push_back(ColumnOrder::type_defined_); @@ -865,7 +866,7 @@ std::shared_ptr FileMetaData::Make( std::shared_ptr file_decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( - new FileMetaData(metadata, metadata_len, properties, file_decryptor)); + new FileMetaData(metadata, metadata_len, properties, std::move(file_decryptor))); } std::shared_ptr FileMetaData::Make( diff --git a/cpp/src/parquet/page_index_benchmark.cc b/cpp/src/parquet/page_index_benchmark.cc new file mode 100644 index 0000000000000..5631034105056 --- /dev/null +++ b/cpp/src/parquet/page_index_benchmark.cc @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
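A side note on the signature changes in metadata.cc and file_reader.cc above (const ReaderProperties& becoming ReaderProperties by value, plus std::move on shared_ptr parameters): taking a sink parameter by value and moving it into the member costs at most one copy for lvalue arguments and none for rvalues. A generic, non-Arrow sketch of the idiom:

// Sink-parameter idiom: pass by value, then move into the member.
#include <memory>
#include <string>
#include <utility>

class Holder {
 public:
  // Lvalue argument: one copy into `data`, then a cheap move.
  // Rvalue argument: no copy at all, just moves.
  explicit Holder(std::shared_ptr<std::string> data) : data_(std::move(data)) {}

 private:
  std::shared_ptr<std::string> data_;
};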
+ +#include <cstdint> +#include <vector> + +#include "benchmark/benchmark.h" + +#include "parquet/benchmark_util.h" +#include "parquet/metadata.h" +#include "parquet/page_index.h" +#include "parquet/schema.h" +#include "parquet/test_util.h" +#include "parquet/thrift_internal.h" + +namespace parquet::benchmark { + +void PageIndexSetArgs(::benchmark::internal::Benchmark* bench) { + bench->ArgNames({"num_pages"}); + bench->Range(8, 1024); +} + +void BM_ReadOffsetIndex(::benchmark::State& state) { + auto builder = OffsetIndexBuilder::Make(); + const int num_pages = static_cast<int>(state.range(0)); + constexpr int64_t page_size = 1024; + constexpr int64_t first_row_index = 10000; + for (int i = 0; i < num_pages; ++i) { + builder->AddPage(page_size * i, page_size, first_row_index * i); + } + constexpr int64_t final_position = 4096; + builder->Finish(final_position); + auto sink = CreateOutputStream(); + builder->WriteTo(sink.get()); + auto buffer = sink->Finish().ValueOrDie(); + ReaderProperties properties; + for (auto _ : state) { + auto offset_index = OffsetIndex::Make( + buffer->data() + 0, static_cast<uint32_t>(buffer->size()), properties); + ::benchmark::DoNotOptimize(offset_index); + } + state.SetBytesProcessed(state.iterations() * buffer->size()); + state.SetItemsProcessed(state.iterations() * num_pages); +} + +BENCHMARK(BM_ReadOffsetIndex)->Apply(PageIndexSetArgs); + +// The sample string length for FLBA and ByteArray benchmarks +constexpr static uint32_t kDataStringLength = 8; + +template <typename DType> +void BM_ReadColumnIndex(::benchmark::State& state) { + schema::NodePtr type = ::parquet::schema::PrimitiveNode::Make( + "b", Repetition::OPTIONAL, DType::type_num, ConvertedType::NONE, 8); + auto descr_ptr = + std::make_unique<ColumnDescriptor>(type, /*def_level=*/1, /*rep_level=*/0); + auto descr = descr_ptr.get(); + + const int num_pages = static_cast<int>(state.range(0)); + auto builder = ColumnIndexBuilder::Make(descr); + + const size_t values_per_page = 100; + for (int i = 0; i < num_pages; ++i) { + auto stats = MakeStatistics<DType>(descr); + std::vector<uint8_t> heap; + std::vector<typename DType::c_type> values; + values.resize(values_per_page); + GenerateBenchmarkData(values_per_page, /*seed=*/0, values.data(), &heap, + kDataStringLength); + stats->Update(values.data(), values_per_page, /*null_count=*/0); + builder->AddPage(stats->Encode()); + } + + builder->Finish(); + auto sink = CreateOutputStream(); + builder->WriteTo(sink.get()); + auto buffer = sink->Finish().ValueOrDie(); + ReaderProperties properties; + for (auto _ : state) { + auto column_index = ColumnIndex::Make(*descr, buffer->data() + 0, + static_cast<uint32_t>(buffer->size()), properties); + ::benchmark::DoNotOptimize(column_index); + } + state.SetBytesProcessed(state.iterations() * buffer->size()); + state.SetItemsProcessed(state.iterations() * num_pages); +} + +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, Int64Type)->Apply(PageIndexSetArgs); +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, DoubleType)->Apply(PageIndexSetArgs); +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, FLBAType)->Apply(PageIndexSetArgs); +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, ByteArrayType)->Apply(PageIndexSetArgs); + +} // namespace parquet::benchmark diff --git a/cpp/src/parquet/platform.h b/cpp/src/parquet/platform.h index 00a193f144a18..b085e57cd9918 100644 --- a/cpp/src/parquet/platform.h +++ b/cpp/src/parquet/platform.h @@ -87,6 +87,7 @@ namespace parquet { using Buffer = ::arrow::Buffer; using Codec = ::arrow::util::Codec; +using CodecOptions = ::arrow::util::CodecOptions; using Compression = ::arrow::Compression; using MemoryPool = ::arrow::MemoryPool; using MutableBuffer
= ::arrow::MutableBuffer; diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c195ab80791ca..bd7eb9dc7abd6 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -153,7 +153,6 @@ class PARQUET_EXPORT ColumnProperties { dictionary_enabled_(dictionary_enabled), statistics_enabled_(statistics_enabled), max_stats_size_(max_stats_size), - compression_level_(Codec::UseDefaultCompressionLevel()), page_index_enabled_(page_index_enabled) {} void set_encoding(Encoding::type encoding) { encoding_ = encoding; } @@ -173,7 +172,14 @@ class PARQUET_EXPORT ColumnProperties { } void set_compression_level(int compression_level) { - compression_level_ = compression_level; + if (!codec_options_) { + codec_options_ = std::make_shared(); + } + codec_options_->compression_level = compression_level; + } + + void set_codec_options(const std::shared_ptr& codec_options) { + codec_options_ = codec_options; } void set_page_index_enabled(bool page_index_enabled) { @@ -190,7 +196,9 @@ class PARQUET_EXPORT ColumnProperties { size_t max_statistics_size() const { return max_stats_size_; } - int compression_level() const { return compression_level_; } + int compression_level() const { return codec_options_->compression_level; } + + const std::shared_ptr& codec_options() const { return codec_options_; } bool page_index_enabled() const { return page_index_enabled_; } @@ -200,7 +208,7 @@ class PARQUET_EXPORT ColumnProperties { bool dictionary_enabled_; bool statistics_enabled_; size_t max_stats_size_; - int compression_level_; + std::shared_ptr codec_options_; bool page_index_enabled_; }; @@ -394,6 +402,9 @@ class PARQUET_EXPORT WriterProperties { /// level is selected by the user or if the special /// std::numeric_limits::min() value is passed, then Arrow selects the /// compression level. + /// + /// If other compressor-specific options need to be set in addition to the compression + /// level, use the codec_options method. Builder* compression_level(int compression_level) { default_column_properties_.set_compression_level(compression_level); return this; @@ -411,7 +422,10 @@ class PARQUET_EXPORT WriterProperties { /// std::numeric_limits::min() value is passed, then Arrow selects the /// compression level. Builder* compression_level(const std::string& path, int compression_level) { - codecs_compression_level_[path] = compression_level; + if (!codec_options_[path]) { + codec_options_[path] = std::make_shared(); + } + codec_options_[path]->compression_level = compression_level; return this; } @@ -431,6 +445,34 @@ class PARQUET_EXPORT WriterProperties { return this->compression_level(path->ToDotString(), compression_level); } + /// \brief Specify the default codec options for the compressor in + /// every column. + /// + /// The codec options allow configuring the compression level as well + /// as other codec-specific options. + Builder* codec_options( + const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { + default_column_properties_.set_codec_options(codec_options); + return this; + } + + /// \brief Specify the codec options for the compressor for the column + /// described by path. + Builder* codec_options( + const std::string& path, + const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { + codec_options_[path] = codec_options; + return this; + } + + /// \brief Specify the codec options for the compressor for the column + /// described by path. 
+ Builder* codec_options( + const std::shared_ptr& path, + const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { + return this->codec_options(path->ToDotString(), codec_options); + } + /// Define the file encryption properties. /// Default NULL. Builder* encryption( @@ -579,8 +621,8 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : encodings_) get(item.first).set_encoding(item.second); for (const auto& item : codecs_) get(item.first).set_compression(item.second); - for (const auto& item : codecs_compression_level_) - get(item.first).set_compression_level(item.second); + for (const auto& item : codec_options_) + get(item.first).set_codec_options(item.second); for (const auto& item : dictionary_enabled_) get(item.first).set_dictionary_enabled(item.second); for (const auto& item : statistics_enabled_) @@ -617,7 +659,7 @@ class PARQUET_EXPORT WriterProperties { ColumnProperties default_column_properties_; std::unordered_map encodings_; std::unordered_map codecs_; - std::unordered_map codecs_compression_level_; + std::unordered_map> codec_options_; std::unordered_map dictionary_enabled_; std::unordered_map statistics_enabled_; std::unordered_map page_index_enabled_; @@ -680,6 +722,11 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).compression_level(); } + const std::shared_ptr codec_options( + const std::shared_ptr& path) const { + return column_properties(path).codec_options(); + } + bool dictionary_enabled(const std::shared_ptr& path) const { return column_properties(path).dictionary_enabled(); } diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index 2ba1b8a604604..96c3a63b831eb 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -70,6 +70,37 @@ TEST(TestWriterProperties, AdvancedHandling) { ASSERT_EQ(ParquetDataPageVersion::V2, props->data_page_version()); } +TEST(TestWriterProperties, SetCodecOptions) { + WriterProperties::Builder builder; + builder.compression("gzip", Compression::GZIP); + builder.compression("zstd", Compression::ZSTD); + builder.compression("brotli", Compression::BROTLI); + auto gzip_codec_options = std::make_shared<::arrow::util::GZipCodecOptions>(); + gzip_codec_options->compression_level = 5; + gzip_codec_options->window_bits = 12; + builder.codec_options("gzip", gzip_codec_options); + auto codec_options = std::make_shared(); + builder.codec_options(codec_options); + auto brotli_codec_options = std::make_shared<::arrow::util::BrotliCodecOptions>(); + brotli_codec_options->compression_level = 11; + brotli_codec_options->window_bits = 20; + builder.codec_options("brotli", brotli_codec_options); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(5, + props->codec_options(ColumnPath::FromDotString("gzip"))->compression_level); + ASSERT_EQ(12, std::dynamic_pointer_cast<::arrow::util::GZipCodecOptions>( + props->codec_options(ColumnPath::FromDotString("gzip"))) + ->window_bits); + ASSERT_EQ(Codec::UseDefaultCompressionLevel(), + props->codec_options(ColumnPath::FromDotString("zstd"))->compression_level); + ASSERT_EQ(11, + props->codec_options(ColumnPath::FromDotString("brotli"))->compression_level); + ASSERT_EQ(20, std::dynamic_pointer_cast<::arrow::util::BrotliCodecOptions>( + props->codec_options(ColumnPath::FromDotString("brotli"))) + ->window_bits); +} + TEST(TestReaderProperties, GetStreamInsufficientData) { // ARROW-6058 std::string data = "shorter than expected"; diff --git a/cpp/src/parquet/test_util.h 
b/cpp/src/parquet/test_util.h index dfb4b5d0fbf4a..b0aafa037ead1 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -556,7 +556,7 @@ static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels } else { num_values = num_levels; } - // Create repitition levels + // Create repetition levels if (max_rep_level > 0 && num_levels != 0) { rep_levels.resize(num_levels); // Using a different seed so that def_levels and rep_levels are different. diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index e9b859541b759..5824a82d5b86d 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -435,8 +435,7 @@ class ThriftDeserializer { #if PARQUET_THRIFT_VERSION_MAJOR > 0 || PARQUET_THRIFT_VERSION_MINOR >= 14 auto conf = std::make_shared<apache::thrift::TConfiguration>(); conf->setMaxMessageSize(std::numeric_limits<int>::max()); - return std::shared_ptr<ThriftBuffer>( - new ThriftBuffer(buf, len, ThriftBuffer::OBSERVE, conf)); + return std::make_shared<ThriftBuffer>(buf, len, ThriftBuffer::OBSERVE, conf); #else return std::make_shared<ThriftBuffer>(buf, len); #endif diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 6e838356c94dc..04a0fc2e0117b 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -51,10 +51,11 @@ bool IsCodecSupported(Compression::type codec) { } std::unique_ptr<Codec> GetCodec(Compression::type codec) { - return GetCodec(codec, Codec::UseDefaultCompressionLevel()); + return GetCodec(codec, CodecOptions()); } -std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) { +std::unique_ptr<Codec> GetCodec(Compression::type codec, + const CodecOptions& codec_options) { std::unique_ptr<Codec> result; if (codec == Compression::LZO) { throw ParquetException( @@ -69,10 +70,15 @@ std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) throw ParquetException(ss.str()); } - PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, compression_level)); + PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, codec_options)); return result; } +// use compression level to create Codec +std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) { + return GetCodec(codec, CodecOptions{compression_level}); +} + bool PageCanUseChecksum(PageType::type pageType) { switch (pageType) { case PageType::type::DATA_PAGE: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index d07ad6246a853..62d625ddcd800 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -30,13 +30,11 @@ #include "parquet/type_fwd.h" #include "parquet/windows_fixup.h" // for OPTIONAL -namespace arrow { -namespace util { +namespace arrow::util { class Codec; -} // namespace util -} // namespace arrow +} // namespace arrow::util namespace parquet { @@ -500,6 +498,10 @@ bool IsCodecSupported(Compression::type codec); PARQUET_EXPORT std::unique_ptr<Codec> GetCodec(Compression::type codec); +PARQUET_EXPORT +std::unique_ptr<Codec> GetCodec(Compression::type codec, + const CodecOptions& codec_options); + PARQUET_EXPORT std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level); diff --git a/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj b/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj index ef2f5be10b641..7afb51925e6cc 100644 --- a/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj +++ b/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj @@ -5,7 +5,7 @@ - + diff --git
a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index d9956ed49cc44..ed33d88861415 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -6,8 +6,8 @@ - - + + diff --git a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs index 35c0065e11907..23ad7356eb322 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs @@ -25,6 +25,9 @@ namespace Apache.Arrow public class Date32Array : PrimitiveArray { private static readonly DateTime _epochDate = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Unspecified); +#if NET6_0_OR_GREATER + private static readonly int _epochDayNumber = new DateOnly(1970, 1, 1).DayNumber; +#endif /// /// The class can be used to fluently build objects. @@ -57,6 +60,13 @@ protected override int Convert(DateTimeOffset dateTimeOffset) // DateTimeOffset.Date property. return (int)(dateTimeOffset.UtcDateTime.Date - _epochDate).TotalDays; } + +#if NET6_0_OR_GREATER + protected override int Convert(DateOnly date) + { + return (int)(date.DayNumber - _epochDayNumber); + } +#endif } public Date32Array( @@ -108,5 +118,21 @@ public Date32Array(ArrayData data) ? new DateTimeOffset(_epochDate.AddDays(value.Value), TimeSpan.Zero) : default(DateTimeOffset?); } + +#if NET6_0_OR_GREATER + /// + /// Get the date at the specified index + /// + /// Index at which to get the date. + /// Returns a , or null if there is no object at that index. + /// + public DateOnly? GetDateOnly(int index) + { + int? value = GetValue(index); + return value.HasValue + ? DateOnly.FromDayNumber(_epochDayNumber + value.Value) + : default(DateOnly?); + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs index cf977b2e4969a..b0d42e27bbd23 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs @@ -69,6 +69,13 @@ protected override long Convert(DateTimeOffset dateTimeOffset) long days = millis / MillisecondsPerDay; return (millis < 0 ? days - 1 : days) * MillisecondsPerDay; } + +#if NET6_0_OR_GREATER + protected override long Convert(DateOnly date) + { + return ((long)date.DayNumber - _epochDayNumber) * MillisecondsPerDay; + } +#endif } public Date64Array(ArrayData data) @@ -113,5 +120,21 @@ public Date64Array(ArrayData data) ? DateTimeOffset.FromUnixTimeMilliseconds(value.Value) : default(DateTimeOffset?); } + +#if NET6_0_OR_GREATER + /// + /// Get the date at the specified index + /// + /// Index at which to get the date. + /// Returns a , or null if there is no object at that index. + /// + public DateOnly? GetDateOnly(int index) + { + long? value = GetValue(index); + return value.HasValue + ? 
DateOnly.FromDateTime(DateTimeOffset.FromUnixTimeMilliseconds(value.Value).UtcDateTime) + : default(DateOnly?); + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs index 4e69f6fe3e7e1..dcbb76930b6d3 100644 --- a/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs +++ b/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs @@ -28,9 +28,16 @@ public abstract class DateArrayBuilder : DelegatingArrayBuilder, IArrowArrayBuilder, IArrowArrayBuilder +#if NET6_0_OR_GREATER + , IArrowArrayBuilder +#endif where TArray : IArrowArray where TBuilder : class, IArrowArrayBuilder { +#if NET6_0_OR_GREATER + protected static readonly long _epochDayNumber = new DateOnly(1970, 1, 1).DayNumber; +#endif + /// /// Construct a new instance of the class. /// @@ -72,6 +79,20 @@ public TBuilder Append(DateTimeOffset value) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Append a date from a object to the array. + /// + /// + /// Date to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(DateOnly value) + { + InnerBuilder.Append(Convert(value)); + return this as TBuilder; + } +#endif + /// /// Append a span of dates in the form of objects to the array. /// @@ -114,6 +135,24 @@ public TBuilder Append(ReadOnlySpan span) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Append a span of dates in the form of objects to the array. + /// + /// Span of dates to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(ReadOnlySpan span) + { + InnerBuilder.Reserve(span.Length); + foreach (var item in span) + { + InnerBuilder.Append(Convert(item)); + } + + return this as TBuilder; + } +#endif + /// /// Append a null date to the array. /// @@ -156,6 +195,19 @@ public TBuilder AppendRange(IEnumerable values) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Append a collection of dates in the form of objects to the array. + /// + /// Collection of dates to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + InnerBuilder.AppendRange(values.Select(Convert)); + return this as TBuilder; + } +#endif + /// /// Set the value of a date in the form of a object at the specified index. /// @@ -190,6 +242,20 @@ public TBuilder Set(int index, DateTimeOffset value) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Set the value of a date in the form of a object at the specified index. + /// + /// Index at which to set value. + /// Date to set. + /// Returns the builder (for fluent-style composition). + public TBuilder Set(int index, DateOnly value) + { + InnerBuilder.Set(index, Convert(value)); + return this as TBuilder; + } +#endif + /// /// Swap the values of the dates at the specified indices. /// @@ -205,5 +271,9 @@ public TBuilder Swap(int i, int j) protected abstract TUnderlying Convert(DateTime dateTime); protected abstract TUnderlying Convert(DateTimeOffset dateTimeOffset); + +#if NET6_0_OR_GREATER + protected abstract TUnderlying Convert(DateOnly date); +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs index bdaf64d5561c7..31d17d06a1e40 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs @@ -14,6 +14,7 @@ // limitations under the License. 
using Apache.Arrow.Types; +using System; using System.IO; namespace Apache.Arrow @@ -27,14 +28,19 @@ public class Time32Array : PrimitiveArray /// /// The class can be used to fluently build objects. /// - public class Builder : PrimitiveArrayBuilder + public class Builder : TimeArrayBuilder { - protected override Time32Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Time32Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); - - protected Time32Type DataType { get; } + private class TimeBuilder : PrimitiveArrayBuilder + { + public Time32Type DataType { get; } + + public TimeBuilder(Time32Type dataType) => DataType = dataType; + + protected override Time32Array Build( + ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, + int length, int nullCount, int offset) => + new Time32Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); + } public Builder() : this(Time32Type.Default) { } @@ -46,10 +52,22 @@ public Builder(TimeUnit unit) /// Construct a new instance of the class. /// public Builder(Time32Type type) - : base() + : base(new TimeBuilder(type)) + { + } + +#if NET6_0_OR_GREATER + protected override int Convert(TimeOnly time) { - DataType = type; + var unit = ((TimeBuilder)InnerBuilder).DataType.Unit; + return unit switch + { + TimeUnit.Second => (int)(time.Ticks / TimeSpan.TicksPerSecond), + TimeUnit.Millisecond => (int)(time.Ticks / TimeSpan.TicksPerMillisecond), + _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") + }; } +#endif } public Time32Array( @@ -113,5 +131,30 @@ public Time32Array(ArrayData data) _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") }; } + +#if NET6_0_OR_GREATER + /// + /// Get the time at the specified index as + /// + /// Index at which to get the time. + /// Returns a , or null if there is no object at that index. + /// + public TimeOnly? GetTime(int index) + { + int? value = GetValue(index); + if (value == null) + { + return null; + } + + var unit = ((Time32Type)Data.DataType).Unit; + return unit switch + { + TimeUnit.Second => new TimeOnly(value.Value * TimeSpan.TicksPerSecond), + TimeUnit.Millisecond => new TimeOnly(value.Value * TimeSpan.TicksPerMillisecond), + _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") + }; + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs index 127db63a7e09d..95faf18fe9e61 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs @@ -14,6 +14,7 @@ // limitations under the License. using Apache.Arrow.Types; +using System; using System.IO; namespace Apache.Arrow @@ -24,17 +25,25 @@ namespace Apache.Arrow /// public class Time64Array : PrimitiveArray { + private const long TicksPerMicrosecond = 10; + private const long NanosecondsPerTick = 100; + /// /// The class can be used to fluently build objects. 
/// - public class Builder : PrimitiveArrayBuilder + public class Builder : TimeArrayBuilder { - protected override Time64Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Time64Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); + private class TimeBuilder : PrimitiveArrayBuilder + { + public Time64Type DataType { get; } - protected Time64Type DataType { get; } + + public TimeBuilder(Time64Type dataType) => DataType = dataType; + + protected override Time64Array Build( + ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, + int length, int nullCount, int offset) => + new Time64Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); + } public Builder() : this(Time64Type.Default) { } @@ -46,10 +55,22 @@ public Builder(TimeUnit unit) /// Construct a new instance of the class. /// public Builder(Time64Type type) - : base() + : base(new TimeBuilder(type)) + { + } + +#if NET6_0_OR_GREATER + protected override long Convert(TimeOnly time) { - DataType = type; + var unit = ((TimeBuilder)InnerBuilder).DataType.Unit; + return unit switch + { + TimeUnit.Microsecond => (long)(time.Ticks / TicksPerMicrosecond), + TimeUnit.Nanosecond => (long)(time.Ticks * NanosecondsPerTick), + _ => throw new InvalidDataException($"Unsupported time unit for Time64Type: {unit}") + }; } +#endif } public Time64Array( @@ -113,5 +134,33 @@ public Time64Array(ArrayData data) _ => throw new InvalidDataException($"Unsupported time unit for Time64Type: {unit}") }; } + +#if NET6_0_OR_GREATER + /// + /// Get the time at the specified index as + /// + /// + /// This may cause truncation of nanosecond values, as the resolution of TimeOnly is in 100-ns increments. + /// + /// Index at which to get the time. + /// Returns a , or null if there is no object at that index. + /// + public TimeOnly? GetTime(int index) + { + long? value = GetValue(index); + if (value == null) + { + return null; + } + + var unit = ((Time64Type)Data.DataType).Unit; + return unit switch + { + TimeUnit.Microsecond => new TimeOnly(value.Value * TicksPerMicrosecond), + TimeUnit.Nanosecond => new TimeOnly(value.Value / NanosecondsPerTick), + _ => throw new InvalidDataException($"Unsupported time unit for Time64Type: {unit}") + }; + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs new file mode 100644 index 0000000000000..da93db84717da --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
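Behind the Time32/Time64 Convert overloads above is plain tick arithmetic: TimeOnly stores 100 ns ticks, so seconds and milliseconds divide down, nanoseconds multiply up, and sub-tick nanosecond values cannot round-trip — the truncation the Time64 remarks call out. An illustrative sketch of that arithmetic, outside the patch:

```csharp
using System;

// Illustration only: the unit arithmetic behind the TimeOnly conversions
// above. TimeOnly has 100 ns (one tick) resolution, so nanosecond values
// that are not a multiple of 100 cannot round-trip exactly.
internal static class TimeUnitArithmetic
{
    private const long TicksPerMicrosecond = 10;   // 1 tick = 100 ns
    private const long NanosecondsPerTick = 100;

    private static void Main()
    {
        var time = new TimeOnly(13, 30, 15);                              // 13:30:15
        long micros = time.Ticks / TicksPerMicrosecond;                   // Time64, microsecond unit
        long nanos = time.Ticks * NanosecondsPerTick;                     // Time64, nanosecond unit
        int millis = (int)(time.Ticks / TimeSpan.TicksPerMillisecond);    // Time32, millisecond unit

        Console.WriteLine(micros); // 48615000000
        Console.WriteLine(nanos);  // 48615000000000
        Console.WriteLine(millis); // 48615000

        // Reading back a nanosecond value truncates to tick resolution:
        long stored = 48615000000123;                          // 123 ns past a tick boundary
        var roundTripped = new TimeOnly(stored / NanosecondsPerTick);
        Console.WriteLine(roundTripped.Ticks * NanosecondsPerTick); // 48615000000100
    }
}
```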
+ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Apache.Arrow +{ + public abstract class TimeArrayBuilder : + DelegatingArrayBuilder +#if NET6_0_OR_GREATER + , IArrowArrayBuilder +#endif + where TArray : IArrowArray + where TBuilder : class, IArrowArrayBuilder + { + /// + /// Construct a new instance of the class. + /// + /// Inner builder that will produce arrays of type . + /// + protected TimeArrayBuilder(IArrowArrayBuilder> innerBuilder) + : base(innerBuilder) + { } + +#if NET6_0_OR_GREATER + /// + /// Append a time in the form of a object to the array. + /// + /// Time to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(TimeOnly value) + { + InnerBuilder.Append(Convert(value)); + return this as TBuilder; + } +#endif + + /// + /// Append a time + /// + /// + /// + public TBuilder Append(TUnderlying value) + { + InnerBuilder.Append(value); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + /// + /// Append a span of times in the form of objects to the array. + /// + /// Span of times to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(ReadOnlySpan span) + { + InnerBuilder.Reserve(span.Length); + foreach (var item in span) + { + InnerBuilder.Append(Convert(item)); + } + + return this as TBuilder; + } +#endif + + public TBuilder Append(ReadOnlySpan values) + { + InnerBuilder.Append(values); + return this as TBuilder; + } + + /// + /// Append a null time to the array. + /// + /// Returns the builder (for fluent-style composition). + public TBuilder AppendNull() + { + InnerBuilder.AppendNull(); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + /// + /// Append a collection of times in the form of objects to the array. + /// + /// Collection of times to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + InnerBuilder.AppendRange(values.Select(Convert)); + return this as TBuilder; + } +#endif + + public TBuilder AppendRange(IEnumerable values) + { + InnerBuilder.AppendRange(values); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + /// + /// Set the value of a time in the form of a object at the specified index. + /// + /// Index at which to set value. + /// Time to set. + /// Returns the builder (for fluent-style composition). + public TBuilder Set(int index, TimeOnly value) + { + InnerBuilder.Set(index, Convert(value)); + return this as TBuilder; + } +#endif + + public TBuilder Set(int index, TUnderlying value) + { + InnerBuilder.Set(index, value); + return this as TBuilder; + } + + /// + /// Swap the values of the times at the specified indices. + /// + /// First index. + /// Second index. + /// Returns the builder (for fluent-style composition). 
+ public TBuilder Swap(int i, int j) + { + InnerBuilder.Swap(i, j); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + protected abstract TUnderlying Convert(TimeOnly time); +#endif + } +} diff --git a/csharp/src/Apache.Arrow/C/CArrowArray.cs b/csharp/src/Apache.Arrow/C/CArrowArray.cs index a8a084d1d767d..fc609f10fdfa5 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArray.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArray.cs @@ -38,11 +38,11 @@ public unsafe struct CArrowArray public byte** buffers; public CArrowArray** children; public CArrowArray* dictionary; - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged release; +#else + internal IntPtr release; #endif - release; public void* private_data; /// @@ -68,10 +68,14 @@ internal delegate* unmanaged /// public static void Free(CArrowArray* array) { - if (array->release != null) + if (array->release != default) { // Call release if not already called. +#if NET5_0_OR_GREATER array->release(array); +#else + Marshal.GetDelegateForFunctionPointer(array->release)(array); +#endif } Marshal.FreeHGlobal((IntPtr)array); } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 5a793c177e0a6..16aaa3874b370 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -26,9 +26,9 @@ public static class CArrowArrayExporter #if NET5_0_OR_GREATER private static unsafe delegate* unmanaged ReleaseArrayPtr => &ReleaseArray; #else - private unsafe delegate void ReleaseArrowArray(CArrowArray* cArray); + internal unsafe delegate void ReleaseArrowArray(CArrowArray* cArray); private static unsafe readonly NativeDelegate s_releaseArray = new NativeDelegate(ReleaseArray); - private static unsafe delegate* unmanaged[Cdecl] ReleaseArrayPtr => (delegate* unmanaged[Cdecl])s_releaseArray.Pointer; + private static IntPtr ReleaseArrayPtr => s_releaseArray.Pointer; #endif /// /// Export an to a . 
Whether or not the @@ -93,7 +93,7 @@ public static unsafe void ExportRecordBatch(RecordBatch batch, CArrowArray* cArr { throw new ArgumentNullException(nameof(cArray)); } - if (cArray->release != null) + if (cArray->release != default) { throw new ArgumentException("Cannot export array to a struct that is already initialized.", nameof(cArray)); } @@ -191,7 +191,7 @@ private unsafe static void ConvertRecordBatch(ExportedAllocationOwner sharedOwne private unsafe static void ReleaseArray(CArrowArray* cArray) { Dispose(&cArray->private_data); - cArray->release = null; + cArray->release = default; } private unsafe static void* FromDisposable(IDisposable disposable) diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index e1314e5a62253..2f4ebed4b0cf1 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -17,6 +17,7 @@ using System; using System.Collections.Generic; +using System.Runtime.InteropServices; using Apache.Arrow.Memory; using Apache.Arrow.Types; @@ -104,21 +105,25 @@ public ImportedArrowArray(CArrowArray* cArray) { throw new ArgumentNullException(nameof(cArray)); } - if (cArray->release == null) + if (cArray->release == default) { throw new ArgumentException("Tried to import an array that has already been released.", nameof(cArray)); } _cArray = *cArray; - cArray->release = null; + cArray->release = default; } protected override void FinalRelease() { - if (_cArray.release != null) + if (_cArray.release != default) { fixed (CArrowArray* cArray = &_cArray) { +#if NET5_0_OR_GREATER cArray->release(cArray); +#else + Marshal.GetDelegateForFunctionPointer(cArray->release)(cArray); +#endif } } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs b/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs index a900a6895a097..9cc9984c6ec8f 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs @@ -35,11 +35,11 @@ public unsafe struct CArrowArrayStream /// /// Return value: 0 if successful, an `errno`-compatible error code otherwise. /// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged get_schema; +#else + internal IntPtr get_schema; #endif - get_schema; /// /// Callback to get the next array. If no error and the array is released, the stream has ended. @@ -47,11 +47,11 @@ internal delegate* unmanaged /// /// Return value: 0 if successful, an `errno`-compatible error code otherwise. /// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged get_next; +#else + internal IntPtr get_next; #endif - get_next; /// /// Callback to get optional detailed error information. This must only @@ -62,21 +62,21 @@ internal delegate* unmanaged /// Return value: pointer to a null-terminated character array describing the last /// error, or NULL if no description is available. /// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged get_last_error; +#else + internal IntPtr get_last_error; #endif - get_last_error; /// /// Release callback: release the stream's own resources. Note that arrays returned by /// get_next must be individually released. 
/// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged release; +#else + internal IntPtr release; #endif - release; public void* private_data; @@ -103,10 +103,15 @@ internal delegate* unmanaged /// public static void Free(CArrowArrayStream* arrayStream) { - if (arrayStream->release != null) + if (arrayStream->release != default) { // Call release if not already called. +#if NET5_0_OR_GREATER + arrayStream->release(arrayStream); +#else + Marshal.GetDelegateForFunctionPointer(arrayStream->release)(arrayStream); +#endif } Marshal.FreeHGlobal((IntPtr)arrayStream); } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs index 56e0468f9415c..0a0f1cc837459 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs @@ -29,22 +29,18 @@ public static class CArrowArrayStreamExporter private static unsafe delegate* unmanaged GetLastErrorPtr => &GetLastError; private static unsafe delegate* unmanaged ReleasePtr => &Release; #else - private unsafe delegate int GetSchemaArrayStream(CArrowArrayStream* cArrayStream, CArrowSchema* cSchema); + internal unsafe delegate int GetSchemaArrayStream(CArrowArrayStream* cArrayStream, CArrowSchema* cSchema); private static unsafe NativeDelegate s_getSchemaArrayStream = new NativeDelegate(GetSchema); - private static unsafe delegate* unmanaged[Cdecl] GetSchemaPtr => - (delegate* unmanaged[Cdecl])s_getSchemaArrayStream.Pointer; - private unsafe delegate int GetNextArrayStream(CArrowArrayStream* cArrayStream, CArrowArray* cArray); + private static unsafe IntPtr GetSchemaPtr => s_getSchemaArrayStream.Pointer; + internal unsafe delegate int GetNextArrayStream(CArrowArrayStream* cArrayStream, CArrowArray* cArray); private static unsafe NativeDelegate s_getNextArrayStream = new NativeDelegate(GetNext); - private static unsafe delegate* unmanaged[Cdecl] GetNextPtr => - (delegate* unmanaged[Cdecl])s_getNextArrayStream.Pointer; - private unsafe delegate byte* GetLastErrorArrayStream(CArrowArrayStream* cArrayStream); + private static unsafe IntPtr GetNextPtr => s_getNextArrayStream.Pointer; + internal unsafe delegate byte* GetLastErrorArrayStream(CArrowArrayStream* cArrayStream); private static unsafe NativeDelegate s_getLastErrorArrayStream = new NativeDelegate(GetLastError); - private static unsafe delegate* unmanaged[Cdecl] GetLastErrorPtr => - (delegate* unmanaged[Cdecl])s_getLastErrorArrayStream.Pointer; - private unsafe delegate void ReleaseArrayStream(CArrowArrayStream* cArrayStream); + private static unsafe IntPtr GetLastErrorPtr => s_getLastErrorArrayStream.Pointer; + internal unsafe delegate void ReleaseArrayStream(CArrowArrayStream* cArrayStream); private static unsafe NativeDelegate s_releaseArrayStream = new NativeDelegate(Release); - private static unsafe delegate* unmanaged[Cdecl] ReleasePtr => - (delegate* unmanaged[Cdecl])s_releaseArrayStream.Pointer; + private static unsafe IntPtr ReleasePtr => s_releaseArrayStream.Pointer; #endif /// @@ -103,7 +99,7 @@ private unsafe static int GetNext(CArrowArrayStream* cArrayStream, CArrowArray* ExportedArrayStream arrayStream = null; try { - cArray->release = null; + cArray->release = default; arrayStream = ExportedArrayStream.FromPointer(cArrayStream->private_data); RecordBatch recordBatch = arrayStream.ArrowArrayStream.ReadNextRecordBatchAsync().Result; if (recordBatch != null) @@ -140,7 +136,7 @@ private unsafe 
static int GetNext(CArrowArrayStream* cArrayStream, CArrowArray* private unsafe static void Release(CArrowArrayStream* cArrayStream) { ExportedArrayStream.Free(&cArrayStream->private_data); - cArrayStream->release = null; + cArrayStream->release = default; } sealed unsafe class ExportedArrayStream : IDisposable @@ -165,7 +161,7 @@ sealed unsafe class ExportedArrayStream : IDisposable public static void Free(void** ptr) { - GCHandle gch = GCHandle.FromIntPtr((IntPtr)ptr); + GCHandle gch = GCHandle.FromIntPtr((IntPtr)(*ptr)); if (!gch.IsAllocated) { return; diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs index 7e70632bf82fc..fe0a307c9b26c 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs @@ -16,6 +16,7 @@ // under the License. using System; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using Apache.Arrow.Ipc; @@ -57,7 +58,11 @@ private sealed unsafe class ImportedArrowArrayStream : IArrowArrayStream internal static string GetLastError(CArrowArrayStream* arrayStream, int errno) { +#if NET5_0_OR_GREATER byte* error = arrayStream->get_last_error(arrayStream); +#else + byte* error = Marshal.GetDelegateForFunctionPointer(arrayStream->get_last_error)(arrayStream); +#endif if (error == null) { return $"Array stream operation failed with no message. Error code: {errno}"; @@ -71,13 +76,17 @@ public ImportedArrowArrayStream(CArrowArrayStream* cArrayStream) { throw new ArgumentNullException(nameof(cArrayStream)); } - if (cArrayStream->release == null) + if (cArrayStream->release == default) { throw new ArgumentException("Tried to import an array stream that has already been released.", nameof(cArrayStream)); } CArrowSchema cSchema = new CArrowSchema(); +#if NET5_0_OR_GREATER int errno = cArrayStream->get_schema(cArrayStream, &cSchema); +#else + int errno = Marshal.GetDelegateForFunctionPointer(cArrayStream->get_schema)(cArrayStream, &cSchema); +#endif if (errno != 0) { throw new Exception(GetLastError(cArrayStream, errno)); @@ -85,7 +94,7 @@ public ImportedArrowArrayStream(CArrowArrayStream* cArrayStream) _schema = CArrowSchemaImporter.ImportSchema(&cSchema); _cArrayStream = *cArrayStream; - cArrayStream->release = null; + cArrayStream->release = default; } ~ImportedArrowArrayStream() @@ -111,12 +120,16 @@ public ValueTask ReadNextRecordBatchAsync(CancellationToken cancell CArrowArray cArray = new CArrowArray(); fixed (CArrowArrayStream* cArrayStream = &_cArrayStream) { +#if NET5_0_OR_GREATER int errno = cArrayStream->get_next(cArrayStream, &cArray); +#else + int errno = Marshal.GetDelegateForFunctionPointer(cArrayStream->get_next)(cArrayStream, &cArray); +#endif if (errno != 0) { return new(Task.FromException(new Exception(GetLastError(cArrayStream, errno)))); } - if (cArray.release != null) + if (cArray.release != default) { result = CArrowArrayImporter.ImportRecordBatch(&cArray, _schema); } @@ -127,12 +140,16 @@ public ValueTask ReadNextRecordBatchAsync(CancellationToken cancell public void Dispose() { - if (!_disposed && _cArrayStream.release != null) + if (!_disposed && _cArrayStream.release != default) { _disposed = true; fixed (CArrowArrayStream* cArrayStream = &_cArrayStream) { +#if NET5_0_OR_GREATER cArrayStream->release(cArrayStream); +#else + Marshal.GetDelegateForFunctionPointer(cArrayStream->release)(cArrayStream); +#endif } } GC.SuppressFinalize(this); diff --git 
a/csharp/src/Apache.Arrow/C/CArrowSchema.cs b/csharp/src/Apache.Arrow/C/CArrowSchema.cs index 64761dbd0d095..50c363b07720f 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchema.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchema.cs @@ -39,11 +39,11 @@ public unsafe struct CArrowSchema public long n_children; public CArrowSchema** children; public CArrowSchema* dictionary; - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged release; +#else + internal IntPtr release; #endif - release; public void* private_data; /// @@ -69,10 +69,14 @@ internal delegate* unmanaged /// public static void Free(CArrowSchema* schema) { - if (schema->release != null) + if (schema->release != default) { // Call release if not already called. +#if NET5_0_OR_GREATER schema->release(schema); +#else + Marshal.GetDelegateForFunctionPointer(schema->release)(schema); +#endif } Marshal.FreeHGlobal((IntPtr)schema); } diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index 9053e80664e31..696212eda36c7 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -30,9 +30,9 @@ public static class CArrowSchemaExporter #if NET5_0_OR_GREATER private static unsafe delegate* unmanaged ReleaseSchemaPtr => &ReleaseCArrowSchema; #else - private unsafe delegate void ReleaseArrowSchema(CArrowSchema* cArray); + internal unsafe delegate void ReleaseArrowSchema(CArrowSchema* cArray); private static unsafe readonly NativeDelegate s_releaseSchema = new NativeDelegate(ReleaseCArrowSchema); - private static unsafe delegate* unmanaged[Cdecl] ReleaseSchemaPtr => (delegate* unmanaged[Cdecl])s_releaseSchema.Pointer; + private static IntPtr ReleaseSchemaPtr => s_releaseSchema.Pointer; #endif /// @@ -297,7 +297,7 @@ private unsafe static void WriteMetadataString(ref byte* ptr, int length, string private static unsafe void ReleaseCArrowSchema(CArrowSchema* schema) { if (schema == null) return; - if (schema->release == null) return; + if (schema->release == default) return; Marshal.FreeHGlobal((IntPtr)schema->format); Marshal.FreeHGlobal((IntPtr)schema->name); @@ -324,7 +324,7 @@ private static unsafe void ReleaseCArrowSchema(CArrowSchema* schema) schema->n_children = 0; schema->dictionary = null; schema->children = null; - schema->release = null; + schema->release = default; } } } diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index 89c9481270c79..b21f24edba9af 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -113,7 +113,7 @@ public ImportedArrowSchema(CArrowSchema* cSchema) throw new ArgumentException("Passed null pointer for cSchema."); } _cSchema = cSchema; - if (_cSchema->release == null) + if (_cSchema->release == default) { throw new ArgumentException("Tried to import a schema that has already been released."); } @@ -128,9 +128,13 @@ public ImportedArrowSchema(CArrowSchema* handle, bool isRoot) : this(handle) public void Dispose() { // We only call release on a root-level schema, not child ones. 
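All of the C Data Interface structs in this patch (CArrowArray, CArrowArrayStream, CArrowSchema) switch between the same two representations of the release and get_* callbacks: on .NET 5+ an unmanaged function pointer, on older frameworks a raw IntPtr marshalled to a delegate at the call site, with `== default` serving as the null test for both builds. A condensed sketch of the pattern — the type and field names here are illustrative, not the actual struct layout:

```csharp
using System;
using System.Runtime.InteropServices;

// Sketch of the dual-framework pattern used throughout the C interface
// changes above. `CReleasable` is a hypothetical stand-in, not an Arrow type.
internal unsafe struct CReleasable
{
#if NET5_0_OR_GREATER
    public delegate* unmanaged<CReleasable*, void> release; // unmanaged function pointer
#else
    public IntPtr release;                                  // raw pointer, marshalled on use
#endif
}

#if !NET5_0_OR_GREATER
internal unsafe delegate void ReleaseFn(CReleasable* self);
#endif

internal static class Releaser
{
    public static unsafe void CallRelease(CReleasable* s)
    {
        // `default` is the null function pointer on .NET 5+ and IntPtr.Zero
        // otherwise, so a single comparison works for both builds.
        if (s->release == default) return;
#if NET5_0_OR_GREATER
        s->release(s);
#else
        Marshal.GetDelegateForFunctionPointer<ReleaseFn>(s->release)(s);
#endif
    }
}
```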
- if (_isRoot && _cSchema->release != null) + if (_isRoot && _cSchema->release != default) { +#if NET5_0_OR_GREATER _cSchema->release(_cSchema); +#else + Marshal.GetDelegateForFunctionPointer(_cSchema->release)(_cSchema); +#endif } } diff --git a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj index c85c0c55d622c..06f42ac1c66ee 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +++ b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj @@ -6,8 +6,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj index 63e28271de8a0..32ff4a9ece5e1 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index cdbfe479470a4..805fb5ab3acce 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -1,8 +1,11 @@ - + net7.0 + true @@ -21,4 +24,12 @@ - \ No newline at end of file + + + + + + + + + diff --git a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs index 9ac2f779a6f69..2568e5e8bdab8 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs @@ -26,6 +26,7 @@ public class ArrayBuilderTests { // TODO: Test various builder invariants (Append, AppendRange, Clear, Resize, Reserve, etc) +#if NET5_0_OR_GREATER [Fact] public void PrimitiveArrayBuildersProduceExpectedArray() { @@ -40,8 +41,8 @@ public void PrimitiveArrayBuildersProduceExpectedArray() Test(); Test(); Test(); - Test(); - Test(); + TestArrayBuilder(x => x.Append(10).Append(20).Append(30)); + TestArrayBuilder(x => x.Append(10).Append(20).Append(30)); static void Test() where T : struct, INumber @@ -64,8 +65,8 @@ public void PrimitiveArrayBuildersProduceExpectedArrayWithNulls() Test(); Test(); Test(); - Test(); - Test(); + TestArrayBuilder(x => x.Append(123).AppendNull().AppendNull().Append(127), 4, 2, 0x9); + TestArrayBuilder(x => x.Append(123).AppendNull().AppendNull().Append(127), 4, 2, 0x9); static void Test() where T : struct, INumber @@ -73,6 +74,7 @@ static void Test() where TBuilder : PrimitiveArrayBuilder, new() => TestArrayBuilder(x => x.Append(T.CreateChecked(123)).AppendNull().AppendNull().Append(T.CreateChecked(127)), 4, 2, 0x09); } +#endif [Fact] public void BooleanArrayBuilderProducersExpectedArray() diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index af3e0f80e6473..d4f0d8dfd0383 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -93,6 +93,7 @@ void TestIsValid(ArrowBuffer valueBuf, ArrowBuffer nullBitmapBuf, int length, in } } +#if NET5_0_OR_GREATER [Fact] public void SliceArray() { @@ -109,8 +110,8 @@ public void SliceArray() TestNumberSlice(); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).Append(new DateTime(2019, 1, 3))); - 
TestNumberSlice(); - TestNumberSlice(); + TestSlice(x => x.Append(10).Append(20).Append(30)); + TestSlice(x => x.Append(10).Append(20).Append(30)); TestSlice(x => x.Append("10").Append("20").Append("30")); static void TestNumberSlice() @@ -136,8 +137,8 @@ public void SlicePrimitiveArrayWithNulls() TestNumberSlice(); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); - TestNumberSlice(); - TestNumberSlice(); + TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); + TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); static void TestNumberSlice() where T : struct, INumber @@ -145,6 +146,7 @@ static void TestNumberSlice() where TBuilder : PrimitiveArrayBuilder, new() => TestSlice(x => x.AppendNull().Append(T.CreateChecked(10)).Append(T.CreateChecked(20)).AppendNull().Append(T.CreateChecked(30))); } +#endif [Fact] public void SliceBooleanArray() @@ -198,7 +200,9 @@ private class ArraySliceValidator : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, +#if NET5_0_OR_GREATER IArrowArrayVisitor, +#endif IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -240,7 +244,9 @@ public void Visit(Date64Array array) public void Visit(Time32Array array) => ValidateArrays(array); public void Visit(Time64Array array) => ValidateArrays(array); +#if NET5_0_OR_GREATER public void Visit(HalfFloatArray array) => ValidateArrays(array); +#endif public void Visit(FloatArray array) => ValidateArrays(array); public void Visit(DoubleArray array) => ValidateArrays(array); public void Visit(StringArray array) => ValidateArrays(array); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index acfe72f83195e..543b446bba876 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -74,7 +74,9 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, +#if NET5_0_OR_GREATER IArrowArrayVisitor, +#endif IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -112,7 +114,9 @@ public ArrayComparer(IArrowArray expectedArray, bool strictCompare) public void Visit(UInt16Array array) => CompareArrays(array); public void Visit(UInt32Array array) => CompareArrays(array); public void Visit(UInt64Array array) => CompareArrays(array); +#if NET5_0_OR_GREATER public void Visit(HalfFloatArray array) => CompareArrays(array); +#endif public void Visit(FloatArray array) => CompareArrays(array); public void Visit(DoubleArray array) => CompareArrays(array); public void Visit(BooleanArray array) => CompareArrays(array); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs index 0e8c9d6687a02..ed030cc6ace11 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs @@ -224,6 +224,7 @@ private class PartialReadStream : MemoryStream // by default return 20 bytes at a time public int PartialReadLength { get; set; } = 20; +#if NET5_0_OR_GREATER public override int Read(Span destination) { if (destination.Length > PartialReadLength) @@ -243,6 +244,17 @@ public override ValueTask ReadAsync(Memory destination, CancellationT return base.ReadAsync(destination, cancellationToken); 
} +#else + public override int Read(byte[] buffer, int offset, int length) + { + return base.Read(buffer, offset, Math.Min(length, PartialReadLength)); + } + + public override Task ReadAsync(byte[] buffer, int offset, int length, CancellationToken cancellationToken = default) + { + return base.ReadAsync(buffer, offset, Math.Min(length, PartialReadLength), cancellationToken); + } +#endif } } } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs index a430e140cfc2a..2bd4d4d661942 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs @@ -47,7 +47,7 @@ public unsafe void InitializeArrayZeroed() Assert.True(cArray->buffers == null); Assert.True(cArray->children == null); Assert.True(cArray->dictionary == null); - Assert.True(cArray->release == null); + Assert.True(cArray->release == default); Assert.True(cArray->private_data == null); CArrowArray.Free(cArray); @@ -59,12 +59,13 @@ public unsafe void CallsReleaseForValid() IArrowArray array = GetTestArray(); CArrowArray* cArray = CArrowArray.Create(); CArrowArrayExporter.ExportArray(array, cArray); - Assert.False(cArray->release == null); + Assert.False(cArray->release == default); CArrowArrayImporter.ImportArray(cArray, array.Data.DataType).Dispose(); - Assert.True(cArray->release == null); + Assert.True(cArray->release == default); CArrowArray.Free(cArray); } +#if NET5_0_OR_GREATER [Fact] public unsafe void CallsReleaseForInvalid() { @@ -75,7 +76,7 @@ public unsafe void CallsReleaseForInvalid() var releaseCallback = (CArrowArray* cArray) => { wasCalled = true; - cArray->release = null; + cArray->release = default; }; cArray->release = (delegate* unmanaged)Marshal.GetFunctionPointerForDelegate( releaseCallback); @@ -90,5 +91,6 @@ public unsafe void CallsReleaseForInvalid() GC.KeepAlive(releaseCallback); } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 084d7bfb014cc..4c53b98e3d9f1 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -28,31 +28,39 @@ namespace Apache.Arrow.Tests { - public class CDataSchemaPythonTest + public class CDataSchemaPythonTest : IClassFixture { - public CDataSchemaPythonTest() + class PythonNet : IDisposable { - bool inCIJob = Environment.GetEnvironmentVariable("GITHUB_ACTIONS") == "true"; - bool inVerificationJob = Environment.GetEnvironmentVariable("TEST_CSHARP") == "1"; - bool pythonSet = Environment.GetEnvironmentVariable("PYTHONNET_PYDLL") != null; - // We only skip if this is not in CI - if (inCIJob && !inVerificationJob && !pythonSet) + public PythonNet() { - throw new Exception("PYTHONNET_PYDLL not set; skipping C Data Interface tests."); - } - else - { - Skip.If(!pythonSet, "PYTHONNET_PYDLL not set; skipping C Data Interface tests."); - } + bool inCIJob = Environment.GetEnvironmentVariable("GITHUB_ACTIONS") == "true"; + bool inVerificationJob = Environment.GetEnvironmentVariable("TEST_CSHARP") == "1"; + bool pythonSet = Environment.GetEnvironmentVariable("PYTHONNET_PYDLL") != null; + // We only skip if this is not in CI + if (inCIJob && !inVerificationJob && !pythonSet) + { + throw new Exception("PYTHONNET_PYDLL not set; skipping C Data Interface tests."); + } + else + { + Skip.If(!pythonSet, "PYTHONNET_PYDLL not set; skipping C Data Interface tests."); + } + + 
PythonEngine.Initialize(); - PythonEngine.Initialize(); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && + PythonEngine.PythonPath.IndexOf("dlls", StringComparison.OrdinalIgnoreCase) < 0) + { + dynamic sys = Py.Import("sys"); + sys.path.append(Path.Combine(Path.GetDirectoryName(Environment.GetEnvironmentVariable("PYTHONNET_PYDLL")), "DLLs")); + } + } - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && - !PythonEngine.PythonPath.Contains("dlls", StringComparison.OrdinalIgnoreCase)) + public void Dispose() { - dynamic sys = Py.Import("sys"); - sys.path.append(Path.Combine(Path.GetDirectoryName(Environment.GetEnvironmentVariable("PYTHONNET_PYDLL")), "DLLs")); + PythonEngine.Shutdown(); } } @@ -360,7 +368,7 @@ public unsafe void ExportType() } // Python should have called release once `exportedPyType` went out-of-scope. - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); Assert.True(cSchema->format == null); Assert.Equal(0, cSchema->flags); Assert.Equal(0, cSchema->n_children); @@ -395,7 +403,7 @@ public unsafe void ExportField() // Python should have called release once `exportedPyField` went out-of-scope. Assert.True(cSchema->name == null); - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); Assert.True(cSchema->format == null); // Since we allocated, we are responsible for freeing the pointer. diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs index dfd6f9912cd4d..4aa5eb6b4d7ed 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs @@ -35,7 +35,7 @@ public unsafe void InitializeZeroed() Assert.Equal(0, cSchema->n_children); Assert.True(cSchema->children == null); Assert.True(cSchema->dictionary == null); - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); Assert.True(cSchema->private_data == null); CArrowSchema.Free(cSchema); @@ -86,12 +86,13 @@ public unsafe void CallsReleaseForValid() { CArrowSchema* cSchema = CArrowSchema.Create(); CArrowSchemaExporter.ExportType(Int32Type.Default, cSchema); - Assert.False(cSchema->release == null); + Assert.False(cSchema->release == default); CArrowSchemaImporter.ImportType(cSchema); - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); CArrowSchema.Free(cSchema); } +#if NET5_0_OR_GREATER // can't round-trip marshaled delegate [Fact] public unsafe void CallsReleaseForInvalid() { @@ -103,7 +104,7 @@ public unsafe void CallsReleaseForInvalid() var releaseCallback = (CArrowSchema* cSchema) => { wasCalled = true; - cSchema->release = null; + cSchema->release = default; }; cSchema->release = (delegate* unmanaged)Marshal.GetFunctionPointerForDelegate( releaseCallback); @@ -117,5 +118,6 @@ public unsafe void CallsReleaseForInvalid() GC.KeepAlive(releaseCallback); } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs index 0d6aad96e5dfd..2a674b942c17b 100644 --- a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs @@ -31,6 +31,11 @@ public static IEnumerable GetDateTimesData() => public static IEnumerable GetDateTimeOffsetsData() => TestDateAndTimeData.ExampleDateTimeOffsets.Select(dto => new object[] { dto }); +#if NET6_0_OR_GREATER + public static IEnumerable GetDateOnlyData() => + 
TestDateAndTimeData.ExampleDates.Select(d => new object[] { DateOnly.FromDateTime(d) }); +#endif + public class AppendNull { [Fact] @@ -121,5 +126,32 @@ public void AppendGivesUtcDate(DateTimeOffset dateTimeOffset) Assert.Equal(expectedValue, array.GetValue(0)); } } + +#if NET6_0_OR_GREATER + public class AppendDateOnly + { + [Theory] + [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date64ArrayTests))] + public void AppendDateGivesSameDate(DateOnly date) + { + // Arrange + var builder = new Date32Array.Builder(); + var expectedDateTime = date.ToDateTime(TimeOnly.MinValue); + var expectedDateTimeOffset = new DateTimeOffset(expectedDateTime, TimeSpan.Zero); + int expectedValue = date.DayNumber - new DateOnly(1970, 1, 1).DayNumber; + + // Act + builder = builder.Append(date); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(date, array.GetDateOnly(0)); + Assert.Equal(expectedDateTime, array.GetDateTime(0)); + Assert.Equal(expectedDateTimeOffset, array.GetDateTimeOffset(0)); + Assert.Equal(expectedValue, array.GetValue(0)); + } + } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs index 65cffc84e5555..22ae08a617c48 100644 --- a/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs @@ -33,6 +33,11 @@ public static IEnumerable GetDateTimesData() => public static IEnumerable GetDateTimeOffsetsData() => TestDateAndTimeData.ExampleDateTimeOffsets.Select(dto => new object[] { dto }); +#if NET6_0_OR_GREATER + public static IEnumerable GetDateOnlyData() => + TestDateAndTimeData.ExampleDates.Select(d => new object[] { DateOnly.FromDateTime(d) }); +#endif + public class AppendNull { [Fact] @@ -129,5 +134,33 @@ public void AppendGivesUtcDate(DateTimeOffset dateTimeOffset) Assert.Equal(0, array.GetValue(0).Value % MillisecondsPerDay); } } + +#if NET6_0_OR_GREATER + public class AppendDateOnly + { + [Theory] + [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date64ArrayTests))] + public void AppendDateGivesSameDate(DateOnly date) + { + // Arrange + var builder = new Date64Array.Builder(); + var expectedDateTime = date.ToDateTime(TimeOnly.MinValue); + var expectedDateTimeOffset = new DateTimeOffset(expectedDateTime, TimeSpan.Zero); + long expectedValue = (date.DayNumber - new DateOnly(1970, 1, 1).DayNumber) * MillisecondsPerDay; + + // Act + builder = builder.Append(date); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(date, array.GetDateOnly(0)); + Assert.Equal(expectedDateTime, array.GetDateTime(0)); + Assert.Equal(expectedDateTimeOffset, array.GetDateTimeOffset(0)); + Assert.Equal(expectedValue, array.GetValue(0)); + Assert.Equal(0, array.GetValue(0).Value % MillisecondsPerDay); + } + } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs b/csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs new file mode 100644 index 0000000000000..0b298dec414c0 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System.Collections.Generic; + +namespace Apache.Arrow.Tests +{ + internal static class Net472Extensions + { + public static IEnumerable<(TFirst First, TSecond Second)> Zip(this IEnumerable first, IEnumerable second) + { + using (var enumerator1 = first.GetEnumerator()) + using (var enumerator2 = second.GetEnumerator()) + { + while (enumerator1.MoveNext() && enumerator2.MoveNext()) + { + yield return (enumerator1.Current, enumerator2.Current); + } + } + } + } +} diff --git a/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs b/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs index 1f2eae45b039c..c258fdd2d6988 100644 --- a/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs @@ -59,6 +59,11 @@ from date in _exampleDates from kind in _exampleKinds select DateTime.SpecifyKind(date, kind); + /// + /// Gets a collection of example times + /// + public static IEnumerable ExampleTimes => _exampleTimes; + /// /// Gets a collection of example date/times, of all different kinds. /// diff --git a/csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs b/csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs new file mode 100644 index 0000000000000..cd66530a0e935 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
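Net472Extensions above backfills the tuple-returning Enumerable.Zip overload that net472 lacks (it ships in the BCL from .NET Core 3.0 / .NET Standard 2.1 onward), so the tests can zip two sequences into (First, Second) pairs on every target framework. A small standalone usage sketch with the shim copied inline; the class name here is hypothetical:

```csharp
using System;
using System.Collections.Generic;

// Usage sketch for the Zip shim above (the real class lives in Apache.Arrow.Tests).
internal static class ZipShim
{
    public static IEnumerable<(TFirst First, TSecond Second)> Zip<TFirst, TSecond>(
        this IEnumerable<TFirst> first, IEnumerable<TSecond> second)
    {
        using (var e1 = first.GetEnumerator())
        using (var e2 = second.GetEnumerator())
        {
            // Lock-step enumeration; stops at the end of the shorter sequence.
            while (e1.MoveNext() && e2.MoveNext())
            {
                yield return (e1.Current, e2.Current);
            }
        }
    }
}

internal static class ZipDemo
{
    private static void Main()
    {
        int[] expected = { 1, 2, 3 };
        int[] actual = { 1, 2, 4 };

        foreach ((int first, int second) in expected.Zip(actual))
        {
            Console.WriteLine($"{first} vs {second}"); // 1 vs 1, 2 vs 2, 3 vs 4
        }
    }
}
```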
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class TimeOnlyTests + { + private static IEnumerable GetTimeOnlyData(params TimeUnit[] units) => + from time in TestDateAndTimeData.ExampleTimes + from unit in units + select new object[] { TimeOnly.FromTimeSpan(time), unit }; + + public class Time32 + { + public static IEnumerable GetTestData => GetTimeOnlyData(TimeUnit.Second, TimeUnit.Millisecond); + + [Fact] + public void AppendThenGetGivesNull() + { + // Arrange + var builder = new Time32Array.Builder(); + + // Act + builder = builder.AppendNull(); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Null(array.GetTime(0)); + Assert.Null(array.GetValue(0)); + } + + [Theory] + [MemberData(nameof(GetTestData))] + public void AppendTimeGivesSameTime(TimeOnly time, TimeUnit timeUnit) + { + // Arrange + var builder = new Time32Array.Builder(timeUnit); + var expectedTime = time; + int expectedMilliseconds = (int)(time.Ticks / TimeSpan.TicksPerMillisecond); + + // Act + builder = builder.Append(time); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(expectedTime, array.GetTime(0)); + Assert.Equal(expectedMilliseconds, array.GetMilliSeconds(0)); + } + } + + public class Time64 + { + public static IEnumerable GetTestData => GetTimeOnlyData(TimeUnit.Microsecond, TimeUnit.Nanosecond); + + [Fact] + public void AppendThenGetGivesNull() + { + // Arrange + var builder = new Time64Array.Builder(); + + // Act + builder = builder.AppendNull(); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Null(array.GetTime(0)); + Assert.Null(array.GetValue(0)); + } + + [Theory] + [MemberData(nameof(GetTestData))] + public void AppendTimeGivesSameTime(TimeOnly time, TimeUnit timeUnit) + { + // Arrange + var builder = new Time64Array.Builder(timeUnit); + var expectedTime = time; + long expectedNanoseconds = time.Ticks * TimeSpan.NanosecondsPerTick; + + // Act + builder = builder.Append(time); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(expectedTime, array.GetTime(0)); + Assert.Equal(expectedNanoseconds, array.GetNanoSeconds(0)); + } + } + } +} diff --git a/dev/archery/archery/templates/release_curation.txt.j2 b/dev/archery/archery/templates/release_curation.txt.j2 index 0796f451625f1..8e72290366bdd 100644 --- a/dev/archery/archery/templates/release_curation.txt.j2 +++ b/dev/archery/archery/templates/release_curation.txt.j2 @@ -39,7 +39,7 @@ {% for commit in noissue -%} - {{ commit.url }} {{ commit.title }} {% endfor %} -### JIRA issues in version {{ release.version }} without a linked patch: {{ nopatch|length }} +### GitHub issues in version {{ release.version }} without a linked patch: {{ nopatch|length }} {% for issue in nopatch -%} - https://github.com/apache/arrow/issues/{{ issue.key }} {% endfor %} diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 7dbfe47d6eeb5..627e576fb6f59 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -28,9 +28,8 @@ jinja_req = 'jinja2>=2.11' extras = { - 'lint': [ - 'numpydoc==1.1.0', 'autopep8', 'flake8', 'cython-lint', 'cmake_format==0.6.13' - ], + 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', + 'cmake_format==0.6.13'], 'benchmark': ['pandas'], 'docker': ['ruamel.yaml', 'python-dotenv'], 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], diff --git 
a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 90b2e9b034eea..0f36a5ba9025c 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -78,7 +78,24 @@ def get_json(url, headers=None): response = requests.get(url, headers=headers) if response.status_code != 200: raise ValueError(response.json()) - return response.json() + # GitHub returns a link header with the next, previous, last + # page if there is pagination on the response. See: + # https://docs.github.com/en/rest/guides/using-pagination-in-the-rest-api#using-link-headers + next_responses = None + if "link" in response.headers: + links = response.headers['link'].split(', ') + for link in links: + if 'rel="next"' in link: + # Format: '; rel="next"' + next_url = link.split(";")[0][1:-1] + next_responses = get_json(next_url, headers) + responses = response.json() + if next_responses: + if isinstance(responses, list): + responses.extend(next_responses) + else: + raise ValueError('GitHub response was paginated and is not a list') + return responses def run_cmd(cmd): diff --git a/dev/release/02-source-test.rb b/dev/release/02-source-test.rb index 1d0fd19d01b84..b9e6a8505b72b 100644 --- a/dev/release/02-source-test.rb +++ b/dev/release/02-source-test.rb @@ -134,7 +134,7 @@ def test_vote I would like to propose the following release candidate (RC0) of Apache Arrow version #{@release_version}. This is a release consisting of #{n_resolved_issues} -resolved JIRA issues[1]. +resolved GitHub issues[1]. This release candidate is based on commit: #{@current_commit} [2] diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index 1e54d6d10db76..e9cd7126361cd 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -168,7 +168,7 @@ Hi, I would like to propose the following release candidate (RC${rc}) of Apache Arrow version ${version}. This is a release consisting of ${n_resolved_issues} -resolved JIRA issues[1]. +resolved GitHub issues[1]. This release candidate is based on commit: ${release_hash} [2] diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f61c217760f61..7bdb692d048e9 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -63,8 +63,7 @@ docs/requirements.txt go.work.sum go/go.sum go/arrow/Gopkg.lock -go/arrow/flight/internal/flight/Flight.pb.go -go/arrow/flight/internal/flight/Flight_grpc.pb.go +go/arrow/flight/gen/flight/*.pb.go go/arrow/internal/cpu/* go/arrow/type_string.go go/arrow/cdata/test/go.sum diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh index 187482cbf52d2..49671f01cc7e8 100755 --- a/dev/release/verify-apt.sh +++ b/dev/release/verify-apt.sh @@ -45,7 +45,21 @@ echo "::group::Prepare repository" export DEBIAN_FRONTEND=noninteractive -APT_INSTALL="apt install -y -V --no-install-recommends" +retry() +{ + local n_retries=0 + local max_n_retries=3 + while ! 
"$@"; do + n_retries=$((n_retries + 1)) + if [ ${n_retries} -eq ${max_n_retries} ]; then + echo "Failed: $@" + return 1 + fi + echo "Retry: $@" + done +} + +APT_INSTALL="retry apt install -y -V --no-install-recommends" apt update ${APT_INSTALL} \ diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 8c5de9bda85aa..ce31b497c1fab 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -665,7 +665,7 @@ test_python() { show_header "Build and test Python libraries" # Build and test Python - maybe_setup_virtualenv cython numpy setuptools_scm setuptools || exit 1 + maybe_setup_virtualenv "cython<3" numpy setuptools_scm setuptools || exit 1 maybe_setup_conda --file ci/conda_env_python.txt || exit 1 if [ "${USE_CONDA}" -gt 0 ]; then diff --git a/dev/release/verify-yum.sh b/dev/release/verify-yum.sh index 03aa8e9dccc75..55fc0c1735931 100755 --- a/dev/release/verify-yum.sh +++ b/dev/release/verify-yum.sh @@ -234,7 +234,7 @@ if [ "${have_glib}" = "yes" ]; then if [ "${have_ruby}" = "yes" ]; then ${install_command} "${ruby_devel_packages[@]}" - gem install gobject-introspection + MAKEFLAGS="-j$(nproc)" gem install gobject-introspection ruby -r gi -e "p GI.load('Arrow')" fi echo "::endgroup::" diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml similarity index 79% rename from dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2.yaml rename to dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml index 5d80a17c4dfd7..1cdcec199e7ba 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml @@ -1,17 +1,15 @@ aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc c_compiler_version: -- '7' +- '10' cdt_name: -- cos6 +- cos7 channel_sources: - conda-forge channel_targets: @@ -19,38 +17,40 @@ channel_targets: cuda_compiler: - nvcc cuda_compiler_version: -- '10.2' +- '11.2' cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '7' +- '10' docker_image: -- quay.io/condaforge/linux-anvil-cos7-cuda:10.2 +- quay.io/condaforge/linux-anvil-cuda:11.2 gflags: - '2.2' glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -61,7 +61,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -73,9 +73,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml index 39b25b44690d7..5be5b58a73932 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml @@ -1,15 +1,13 @@ aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc 
c_compiler_version: -- '11' +- '12' cdt_name: - cos6 channel_sources: @@ -17,15 +15,15 @@ channel_sources: channel_targets: - conda-forge main cuda_compiler: -- nvcc +- None cuda_compiler_version: - None cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 gflags: @@ -33,24 +31,26 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -61,7 +61,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -73,9 +73,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml new file mode 100644 index 0000000000000..1677b03564c08 --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml @@ -0,0 +1,91 @@ +BUILD: +- aarch64-conda_cos7-linux-gnu +aws_crt_cpp: +- 0.20.3 +aws_sdk_cpp: +- 1.10.57 +bzip2: +- '1' +c_compiler: +- gcc +c_compiler_version: +- '10' +cdt_arch: +- aarch64 +cdt_name: +- cos7 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cuda_compiler: +- nvcc +cuda_compiler_version: +- '11.2' +cuda_compiler_version_min: +- '11.2' +cxx_compiler: +- gxx +cxx_compiler_version: +- '10' +docker_image: +- quay.io/condaforge/linux-anvil-cuda:11.2 +gflags: +- '2.2' +glog: +- '0.6' +google_cloud_cpp: +- '2.12' +libabseil: +- '20230125' +libgrpc: +- '1.54' +- '1.56' +libprotobuf: +- '3.21' +- 4.23.3 +lz4_c: +- 1.9.3 +numpy: +- '1.21' +- '1.23' +- '1.21' +- '1.21' +openssl: +- '3' +orc: +- 1.9.0 +pin_run_as_build: + python: + min_pin: x.x + max_pin: x.x +python: +- 3.10.* *_cpython +- 3.11.* *_cpython +- 3.8.* *_cpython +- 3.9.* *_cpython +re2: +- 2023.03.02 +snappy: +- '1' +target_platform: +- linux-aarch64 +thrift_cpp: +- 0.18.1 +ucx: +- 1.14.0 +zip_keys: +- - c_compiler_version + - cxx_compiler_version + - cuda_compiler + - cuda_compiler_version + - cdt_name + - docker_image +- - libgrpc + - libprotobuf +- - python + - numpy +zlib: +- '1.2' +zstd: +- '1.5' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml index af0fc2dcd255e..88fdf1254e661 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml @@ -1,17 +1,15 @@ BUILD: - aarch64-conda_cos7-linux-gnu aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_arch: - aarch64 cdt_name: @@ -20,12 +18,16 @@ channel_sources: - conda-forge channel_targets: - conda-forge main +cuda_compiler: +- None cuda_compiler_version: - None +cuda_compiler_version_min: +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 gflags: @@ -33,24 +35,26 @@ gflags: glog: - '0.6' 
google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -61,7 +65,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -73,9 +77,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml new file mode 100644 index 0000000000000..3585db7b99baa --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml @@ -0,0 +1,87 @@ +aws_crt_cpp: +- 0.20.3 +aws_sdk_cpp: +- 1.10.57 +bzip2: +- '1' +c_compiler: +- gcc +c_compiler_version: +- '10' +cdt_name: +- cos7 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cuda_compiler: +- nvcc +cuda_compiler_version: +- '11.2' +cuda_compiler_version_min: +- '11.2' +cxx_compiler: +- gxx +cxx_compiler_version: +- '10' +docker_image: +- quay.io/condaforge/linux-anvil-cuda:11.2 +gflags: +- '2.2' +glog: +- '0.6' +google_cloud_cpp: +- '2.12' +libabseil: +- '20230125' +libgrpc: +- '1.54' +- '1.56' +libprotobuf: +- '3.21' +- 4.23.3 +lz4_c: +- 1.9.3 +numpy: +- '1.21' +- '1.23' +- '1.21' +- '1.21' +openssl: +- '3' +orc: +- 1.9.0 +pin_run_as_build: + python: + min_pin: x.x + max_pin: x.x +python: +- 3.10.* *_cpython +- 3.11.* *_cpython +- 3.8.* *_cpython +- 3.9.* *_cpython +re2: +- 2023.03.02 +snappy: +- '1' +target_platform: +- linux-ppc64le +thrift_cpp: +- 0.18.1 +ucx: +- 1.14.0 +zip_keys: +- - c_compiler_version + - cxx_compiler_version + - cuda_compiler + - cuda_compiler_version + - cdt_name + - docker_image +- - libgrpc + - libprotobuf +- - python + - numpy +zlib: +- '1.2' +zstd: +- '1.5' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml index 83a1f7f740092..c13a522254286 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml @@ -1,27 +1,29 @@ aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_name: - cos7 channel_sources: - conda-forge channel_targets: - conda-forge main +cuda_compiler: +- None cuda_compiler_version: - None +cuda_compiler_version_min: +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 gflags: @@ -29,24 +31,26 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -57,7 +61,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -69,9 +73,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - 
docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml index 0cf990cc113f2..dd4a230760ef2 100644 --- a/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml +++ b/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml @@ -1,17 +1,15 @@ MACOSX_DEPLOYMENT_TARGET: - '10.9' aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -21,19 +19,21 @@ cuda_compiler_version: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' gflags: - '2.2' glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 macos_machine: @@ -41,12 +41,12 @@ macos_machine: numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -57,7 +57,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -67,6 +67,8 @@ thrift_cpp: zip_keys: - - c_compiler_version - cxx_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml b/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml index 3faa6278e81e2..6a6713a54fe86 100644 --- a/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml +++ b/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml @@ -1,17 +1,15 @@ MACOSX_DEPLOYMENT_TARGET: - '11.0' aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -21,19 +19,21 @@ cuda_compiler_version: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' gflags: - '2.2' glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 macos_machine: @@ -41,12 +41,12 @@ macos_machine: numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -57,7 +57,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -67,6 +67,8 @@ thrift_cpp: zip_keys: - - c_compiler_version - cxx_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml new file mode 100644 index 0000000000000..e63767cbe9771 --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml @@ -0,0 +1,27 @@ +c_compiler: +- gcc +c_compiler_version: +- '12' +cdt_name: +- cos6 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +docker_image: +- quay.io/condaforge/linux-anvil-cos7-x86_64 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- linux-64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml index 38753baa7ed09..6e661e1357d22 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml +++ 
b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml @@ -1,7 +1,7 @@ c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_name: - cos6 channel_sources: @@ -11,7 +11,7 @@ channel_targets: cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml new file mode 100644 index 0000000000000..2b80b020fdc0b --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml @@ -0,0 +1,31 @@ +BUILD: +- aarch64-conda_cos7-linux-gnu +c_compiler: +- gcc +c_compiler_version: +- '12' +cdt_arch: +- aarch64 +cdt_name: +- cos7 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +docker_image: +- quay.io/condaforge/linux-anvil-cos7-x86_64 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- linux-aarch64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml index 2913bbb4f141f..9dcd0c34c851c 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml @@ -3,7 +3,7 @@ BUILD: c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_arch: - aarch64 cdt_name: @@ -15,7 +15,7 @@ channel_targets: cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml new file mode 100644 index 0000000000000..6be6c2f5462c5 --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml @@ -0,0 +1,27 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +c_compiler: +- clang +c_compiler_version: +- '15' +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- clangxx +cxx_compiler_version: +- '15' +macos_machine: +- x86_64-apple-darwin13.4.0 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- osx-64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml index 25437ee4adcfe..2116eaf7b8b21 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml @@ -3,7 +3,7 @@ MACOSX_DEPLOYMENT_TARGET: c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -11,7 +11,7 @@ channel_targets: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' macos_machine: - x86_64-apple-darwin13.4.0 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml new file mode 100644 index 0000000000000..0ce856fcccf5c --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml @@ -0,0 +1,27 @@ +MACOSX_DEPLOYMENT_TARGET: +- '11.0' +c_compiler: +- clang +c_compiler_version: +- '15' +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- 
clangxx +cxx_compiler_version: +- '15' +macos_machine: +- arm64-apple-darwin20.0.0 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- osx-arm64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml index 1557b23ff96af..af8a07c42208e 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml @@ -3,7 +3,7 @@ MACOSX_DEPLOYMENT_TARGET: c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -11,7 +11,7 @@ channel_targets: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' macos_machine: - arm64-apple-darwin20.0.0 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version10.2.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml similarity index 78% rename from dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version10.2.yaml rename to dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml index 6ea00e3bd0d3f..f75d92e276d9e 100644 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version10.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml @@ -1,3 +1,5 @@ +aws_crt_cpp: +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: @@ -13,9 +15,9 @@ channel_targets: cuda_compiler: - nvcc cuda_compiler_version: -- '10.2' +- '11.2' cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - vs2019 gflags: @@ -23,28 +25,30 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libcrc32c: - '1.1' libcurl: -- '7' +- '8' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -55,7 +59,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -63,6 +67,10 @@ target_platform: thrift_cpp: - 0.18.1 zip_keys: +- - cuda_compiler + - cuda_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml index 183356662c648..6d8fb15b15a2a 100644 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml @@ -1,3 +1,5 @@ +aws_crt_cpp: +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: @@ -11,11 +13,11 @@ channel_sources: channel_targets: - conda-forge main cuda_compiler: -- nvcc +- None cuda_compiler_version: - None cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - vs2019 gflags: @@ -23,28 +25,30 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libcrc32c: - '1.1' libcurl: -- '7' +- '8' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -55,7 +59,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -63,6 +67,10 @@ target_platform: thrift_cpp: - 0.18.1 
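# cuda_compiler was flipped from nvcc to None above for this CPU-only config;
# zipping it with cuda_compiler_version (added to zip_keys just below) keeps
# the compiler choice and the toolkit version moving in lockstep instead of
# being crossed. Sketch of the intended pairings across the two win_64 configs:
#
#   cuda_compiler: None  <->  cuda_compiler_version: None    (this file)
#   cuda_compiler: nvcc  <->  cuda_compiler_version: '11.2'  (...version11.2.yaml)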
zip_keys: +- - cuda_compiler + - cuda_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/arrow-cpp/activate.sh b/dev/tasks/conda-recipes/arrow-cpp/activate.sh index 90210fac0a034..8757612781bbe 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/activate.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/activate.sh @@ -7,24 +7,70 @@ # doesn't come with a deactivate script, because the symlink # is benign and doesn't need to be deleted. -# where the GDB wrappers get installed -GDB_PREFIX=$CONDA_PREFIX/share/gdb/auto-load +_la_log() { + if [ "${CF_LIBARROW_ACTIVATE_LOGGING:-}" = "1" ]; then + # The following loop is necessary to handle multi-line strings + # like for the output of `ls -al`. + printf '%s\n' "$*" | while IFS= read -r line + do + echo "$CONDA_PREFIX/etc/conda/activate.d/libarrow_activate.sh DEBUG: $line" + done + fi +} + +_la_log "Beginning libarrow activation." -# If the directory is not writable, nothing can be done -if [ ! -w $GDB_PREFIX ]; then - return -fi +# where the GDB wrappers get installed +_la_gdb_prefix="$CONDA_PREFIX/share/gdb/auto-load" -# this needs to be in sync with the respective patch -PLACEHOLDER=replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX +# this needs to be in sync with ARROW_GDB_INSTALL_DIR in build.sh +_la_placeholder="replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX" # the paths here are intentionally stacked, see #935, resp. # https://github.com/apache/arrow/blob/master/docs/source/cpp/gdb.rst#manual-loading -WRAPPER_DIR=$GDB_PREFIX/$CONDA_PREFIX/lib +_la_symlink_dir="$_la_gdb_prefix/$CONDA_PREFIX/lib" +_la_orig_install_dir="$_la_gdb_prefix/$_la_placeholder/lib" -mkdir -p $WRAPPER_DIR -# there's only one lib in that folder, but the libname changes +_la_log " _la_gdb_prefix: $_la_gdb_prefix" +_la_log " _la_placeholder: $_la_placeholder" +_la_log " _la_symlink_dir: $_la_symlink_dir" +_la_log " _la_orig_install_dir: $_la_orig_install_dir" +_la_log " content of that folder:" +_la_log "$(ls -al "$_la_orig_install_dir" | sed 's/^/ /')" + +# there's only one lib in the _la_orig_install_dir folder, but the libname changes # based on the version so use a loop instead of hardcoding it. -for f in $GDB_PREFIX/$PLACEHOLDER/lib/*.py; do - # overwrite, because we don't have deactivation (i.e. symlink remains) - ln -sf $f $WRAPPER_DIR/$(basename $f) +for _la_target in "$_la_orig_install_dir/"*.py; do + if [ ! -e "$_la_target" ]; then + # If the file doesn't exist, skip this iteration of the loop. + # (This happens when no files are found, in which case the + # loop runs with target equal to the pattern itself.) + _la_log 'Folder $_la_orig_install_dir seems to not contain .py files, skipping' + continue + fi + _la_symlink="$_la_symlink_dir/$(basename "$_la_target")" + _la_log " _la_target: $_la_target" + _la_log " _la_symlink: $_la_symlink" + if [ -L "$_la_symlink" ] && [ "$(readlink "$_la_symlink")" = "$_la_target" ]; then + _la_log 'symlink $_la_symlink already exists and points to $_la_target, skipping.' + continue + fi + _la_log 'Creating symlink $_la_symlink pointing to $_la_target' + mkdir -p "$_la_symlink_dir" || true + # this check also creates the symlink; if it fails, we enter the if-branch. + if ! ln -sf "$_la_target" "$_la_symlink"; then + echo -n "${BASH_SOURCE[0]} ERROR: Failed to create symlink from " + echo -n "'$_la_target' to '$_la_symlink'" + echo + continue + fi done + +_la_log "Libarrow activation complete." 
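# The _la_log calls above are gated on CF_LIBARROW_ACTIVATE_LOGGING, so the
# script stays silent during normal activation. To trace a misbehaving
# activation (sketch; the environment name is hypothetical):
#
#   export CF_LIBARROW_ACTIVATE_LOGGING=1
#   conda activate my-arrow-env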
+ +unset _la_gdb_prefix +unset _la_log +unset _la_orig_install_dir +unset _la_placeholder +unset _la_symlink +unset _la_symlink_dir +unset _la_target diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.bat similarity index 89% rename from dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat rename to dev/tasks/conda-recipes/arrow-cpp/build-arrow.bat index 60c81be741128..1268771643d4f 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat +++ b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.bat @@ -1,16 +1,12 @@ @echo on -mkdir "%SRC_DIR%"\cpp\build -pushd "%SRC_DIR%"\cpp\build +mkdir cpp\build +pushd cpp\build :: Enable CUDA support if "%cuda_compiler_version%"=="None" ( set "EXTRA_CMAKE_ARGS=-DARROW_CUDA=OFF" ) else ( - REM this should move to nvcc-feedstock - set "CUDA_PATH=%CUDA_PATH:\=/%" - set "CUDA_HOME=%CUDA_HOME:\=/%" - set "EXTRA_CMAKE_ARGS=-DARROW_CUDA=ON" ) @@ -18,8 +14,9 @@ if "%cuda_compiler_version%"=="None" ( set "READ_RECIPE_META_YAML_WHY_NOT=OFF" :: for available switches see -:: https://github.com/apache/arrow/blame/apache-arrow-11.0.0/cpp/cmake_modules/DefineOptions.cmake +:: https://github.com/apache/arrow/blame/apache-arrow-12.0.0/cpp/cmake_modules/DefineOptions.cmake cmake -G "Ninja" ^ + -DARROW_ACERO=ON ^ -DARROW_BOOST_USE_SHARED:BOOL=ON ^ -DARROW_BUILD_STATIC:BOOL=OFF ^ -DARROW_BUILD_TESTS:BOOL=OFF ^ @@ -69,3 +66,6 @@ cmake --build . --target install --config Release if %ERRORLEVEL% neq 0 exit 1 popd + +:: clean up between builds (and to save space) +rmdir /s /q cpp\build diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh index fb8cbade86568..dc588f9473870 100755 --- a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh @@ -30,27 +30,21 @@ fi # Enable CUDA support if [[ ! -z "${cuda_compiler_version+x}" && "${cuda_compiler_version}" != "None" ]] then - if [[ -z "${CUDA_HOME+x}" ]] - then - echo "cuda_compiler_version=${cuda_compiler_version} CUDA_HOME=$CUDA_HOME" - CUDA_GDB_EXECUTABLE=$(which cuda-gdb || exit 0) - if [[ -n "$CUDA_GDB_EXECUTABLE" ]] - then - CUDA_HOME=$(dirname $(dirname $CUDA_GDB_EXECUTABLE)) - else - echo "Cannot determine CUDA_HOME: cuda-gdb not in PATH" - return 1 - fi - fi - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CUDA_HOME}/lib64/stubs" + EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CONDA_BUILD_SYSROOT}/lib" else EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=OFF" fi -if [[ "${target_platform}" == "osx-arm64" ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCLANG_EXECUTABLE=${BUILD_PREFIX}/bin/clang -DLLVM_LINK_EXECUTABLE=${BUILD_PREFIX}/bin/llvm-link" +if [[ "${build_platform}" != "${target_platform}" ]]; then + # point to a usable protoc/grpc_cpp_plugin if we're cross-compiling + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DProtobuf_PROTOC_EXECUTABLE=$BUILD_PREFIX/bin/protoc" + if [[ ! 
-f ${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang ]]; then + ln -sf ${BUILD_PREFIX}/bin/clang ${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang + fi + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCLANG_EXECUTABLE=${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang" + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DLLVM_LINK_EXECUTABLE=${BUILD_PREFIX}/bin/llvm-link" sed -ie "s;protoc-gen-grpc.*$;protoc-gen-grpc=${BUILD_PREFIX}/bin/grpc_cpp_plugin\";g" ../src/arrow/flight/CMakeLists.txt - sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=14";g' ../cmake_modules/ThirdpartyToolchain.cmake + sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=16";g' ../cmake_modules/ThirdpartyToolchain.cmake fi # disable -fno-plt, which causes problems with GCC on PPC @@ -59,23 +53,19 @@ if [[ "$target_platform" == "linux-ppc64le" ]]; then CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fno-plt //g')" fi -# Limit number of threads used to avoid hardware oversubscription if [[ "${target_platform}" == "linux-aarch64" ]] || [[ "${target_platform}" == "linux-ppc64le" ]]; then - export CMAKE_BUILD_PARALLEL_LEVEL=3 -fi - -# point to a usable protoc if we're running on a different architecture than the target -if [[ "${build_platform}" != "${target_platform}" ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DProtobuf_PROTOC_EXECUTABLE=$BUILD_PREFIX/bin/protoc" + # Limit number of threads used to avoid hardware oversubscription + export CMAKE_BUILD_PARALLEL_LEVEL=3 fi # reusable variable for dependencies we cannot yet unvendor export READ_RECIPE_META_YAML_WHY_NOT=OFF # for available switches see -# https://github.com/apache/arrow/blame/apache-arrow-11.0.0/cpp/cmake_modules/DefineOptions.cmake -# placeholder in ARROW_GDB_INSTALL_DIR must match what's used for replacement in activate.sh +# https://github.com/apache/arrow/blame/apache-arrow-12.0.0/cpp/cmake_modules/DefineOptions.cmake +# placeholder in ARROW_GDB_INSTALL_DIR must match _la_placeholder in activate.sh cmake -GNinja \ + -DARROW_ACERO=ON \ -DARROW_BOOST_USE_SHARED=ON \ -DARROW_BUILD_BENCHMARKS=OFF \ -DARROW_BUILD_STATIC=OFF \ @@ -129,3 +119,6 @@ cmake -GNinja \ cmake --build . --target install --config Release popd + +# clean up between builds (and to save space) +rm -rf cpp/build diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.bat similarity index 54% rename from dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat rename to dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.bat index 084faf74e4a10..e3eaa32bcf848 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.bat @@ -2,22 +2,10 @@ pushd "%SRC_DIR%"\python -@rem the symlinks for cmake modules don't work here -@rem NOTE: In contrast to conda-forge, they work here as we clone from git. 
-@rem del cmake_modules\BuildUtils.cmake -@rem del cmake_modules\SetupCxxFlags.cmake -@rem del cmake_modules\CompilerInfo.cmake -@rem del cmake_modules\FindNumPy.cmake -@rem del cmake_modules\FindPythonLibsNew.cmake -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\BuildUtils.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\SetupCxxFlags.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\CompilerInfo.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\FindNumPy.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\FindPythonLibsNew.cmake" cmake_modules\ - SET ARROW_HOME=%LIBRARY_PREFIX% SET SETUPTOOLS_SCM_PRETEND_VERSION=%PKG_VERSION% SET PYARROW_BUILD_TYPE=release +SET PYARROW_WITH_ACERO=1 SET PYARROW_WITH_DATASET=1 SET PYARROW_WITH_FLIGHT=1 SET PYARROW_WITH_GANDIVA=1 diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh index 14c67ede6324e..9c12321a1c115 100755 --- a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh @@ -6,6 +6,7 @@ export ARROW_HOME=$PREFIX export PARQUET_HOME=$PREFIX export SETUPTOOLS_SCM_PRETEND_VERSION=$PKG_VERSION export PYARROW_BUILD_TYPE=release +export PYARROW_WITH_ACERO=1 export PYARROW_WITH_DATASET=1 export PYARROW_WITH_FLIGHT=1 export PYARROW_WITH_GANDIVA=1 @@ -37,9 +38,9 @@ if [[ "${target_platform}" == osx-* ]]; then CXXFLAGS="${CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" fi -# Limit number of threads used to avoid hardware oversubscription if [[ "${target_platform}" == "linux-aarch64" ]] || [[ "${target_platform}" == "linux-ppc64le" ]]; then - export CMAKE_BUILD_PARALLEL_LEVEL=4 + # Limit number of threads used to avoid hardware oversubscription + export CMAKE_BUILD_PARALLEL_LEVEL=4 fi cd python diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 2f79bbe958c07..ac4b29eb5ee7e 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -4,7 +4,7 @@ {% set build_ext_version = ARROW_VERSION %} {% set build_ext = "cuda" if cuda_enabled else "cpu" %} {% set proc_build_number = "0" %} -{% set llvm_version = "14" %} +{% set llvm_version = "15" %} # see https://github.com/apache/arrow/blob/apache-arrow-10.0.1/cpp/CMakeLists.txt#L88-L90 {% set so_version = (version.split(".")[0] | int * 100 + version.split(".")[1] | int) ~ "." ~ version.split(".")[2] ~ ".0" %} @@ -21,11 +21,9 @@ build: # for cuda support, building with one version is enough to be compatible with # all later versions, since arrow is only using libcuda, and not libcudart. 
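  # The selector on the skip: line below encodes that policy: only the "None"
  # (CPU) variant and the cuda_compiler_version_min variant (11.2 after this
  # change) are built, and any newer CUDA entry in the matrix is skipped.
  # Illustrative evaluation of the selector:
  #
  #   cuda_compiler_version = "None"  ->  built (CPU)
  #   cuda_compiler_version = "11.2"  ->  built (minimum CUDA, compatible upward)
  #   cuda_compiler_version = "12.0"  ->  skipped (hypothetical newer entry)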
skip: true # [cuda_compiler_version not in ("None", cuda_compiler_version_min)] - # temporary: skip CUDA on aarch/ppc until cross-compilation works, see - # https://github.com/conda-forge/conda-forge-ci-setup-feedstock/pull/210 - skip: true # [(aarch64 or ppc64le) and (cuda_compiler_version != "None")] + # arrow promises API- & ABI-compatibility along SemVer, see #1096 run_exports: - - {{ pin_subpackage("libarrow", max_pin="x.x.x") }} + - {{ pin_subpackage("libarrow", max_pin="x") }} outputs: - name: apache-arrow-proc @@ -57,30 +55,38 @@ outputs: - exit 0 - name: libarrow - script: build-arrow.sh # [not win] - script: bld-arrow.bat # [win] + script: build-arrow.sh # [unix] + script: build-arrow.bat # [win] version: {{ version }} build: string: h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} run_exports: - {{ pin_subpackage("libarrow", max_pin="x.x.x") }} + ignore_run_exports_from: + - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] + # arrow only uses headers, apparently + - gflags + # shared lib linked on unix, not on win + - glog # [win] ignore_run_exports: - - cudatoolkit + # we don't need all of brotli's run-exports + - libbrotlicommon track_features: {{ "[arrow-cuda]" if cuda_enabled else "" }} missing_dso_whitelist: - - "*/libcuda.so.*" # [linux] - - "*/nvcuda.dll" # [win] + - '*/libcuda.so.*' # [linux] + - '*/nvcuda.dll' # [win] requirements: build: - {{ compiler("c") }} - {{ compiler("cxx") }} - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - - clangdev {{ llvm_version }} # [osx and arm64] - - llvmdev {{ llvm_version }} # [osx and arm64] - - gnuconfig # [osx and arm64] # needs to run protoc & grpc_cpp_plugin - libgrpc # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] + # needed for gandiva + - clangdev {{ llvm_version }} # [build_platform != target_platform] + - llvmdev {{ llvm_version }} # [build_platform != target_platform] + - gnuconfig # [build_platform != target_platform] - cmake - ninja # necessary for vendored jemalloc @@ -91,12 +97,11 @@ outputs: # https://github.com/apache/arrow/blob/apache-arrow-11.0.0/cpp/cmake_modules/ThirdpartyToolchain.cmake#L46-L75 - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - - aws-crt-cpp # [unix] + - aws-crt-cpp - aws-sdk-cpp - boost-cpp >=1.70 - brotli - bzip2 - - c-ares # not yet: https://github.com/conda-forge/cpp-opentelemetry-sdk-feedstock/issues/38 # - cpp-opentelemetry-sdk # - proto-opentelemetry-proto =={{ cpp_opentelemetry_sdk }} @@ -106,11 +111,6 @@ outputs: # arrow uses a customized jemalloc, see #944 # - jemalloc - libabseil - # since libgoogle-cloud is static on windows, see - # https://github.com/conda-forge/google-cloud-cpp-feedstock/pull/108, - # its dependencies leak into the build here - - libcrc32c # [win] - - libcurl # [win] - libgrpc - libprotobuf - libutf8proc @@ -127,17 +127,26 @@ outputs: - xsimd - zlib - zstd + - __cuda >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] + # since libgoogle-cloud is static on windows, see + # https://github.com/conda-forge/google-cloud-cpp-feedstock/pull/108, + # its host deps (which aren't yet covered above) leak into the build here + - libcrc32c # [win] + - libcurl # [win] + # same for libgrpc (before 1.55.0, which is coupled with libprotobuf 4.23.x) + - c-ares # [win and libprotobuf == "3.21"] run_constrained: - apache-arrow-proc =*={{ build_ext }} - - cudatoolkit >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] # make sure we don't co-install with old version of old 
package name - arrow-cpp ={{ version }} + # old parquet lib output, now part of this feedstock + - parquet-cpp <0.0a0 test: commands: {% set headers = [ - "arrow/api.h", "arrow/flight/types.h", "arrow/flight/sql/api.h", - "gandiva/engine.h", "parquet/api/reader.h" + "arrow/api.h", "arrow/acero/api.h", "arrow/flight/types.h", + "arrow/flight/sql/api.h", "gandiva/engine.h", "parquet/api/reader.h" ] %} {% for each_header in headers %} # headers @@ -146,8 +155,8 @@ outputs: {% endfor %} {% set libs = (cuda_compiler_version != "None") * ["arrow_cuda"] + [ - "arrow", "arrow_dataset", "arrow_flight", "arrow_flight_sql", - "arrow_substrait", "gandiva", "parquet" + "arrow", "arrow_acero", "arrow_dataset", "arrow_flight", + "arrow_flight_sql", "arrow_substrait", "gandiva", "parquet" ] %} {% for each_lib in libs %} # shared @@ -189,6 +198,8 @@ outputs: requirements: host: - {{ pin_subpackage('libarrow', exact=True) }} + # avoid wrappers for different builds colliding due to identical hashes + - libprotobuf run: - {{ pin_subpackage('libarrow', exact=True) }} test: @@ -196,21 +207,21 @@ outputs: - exit 0 - name: pyarrow - script: build-pyarrow.sh # [not win] - script: bld-pyarrow.bat # [win] + script: build-pyarrow.sh # [unix] + script: build-pyarrow.bat # [win] version: {{ version }} build: string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - ignore_run_exports: - - cudatoolkit + ignore_run_exports_from: + - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] track_features: {{ "[arrow-cuda]" if cuda_enabled else "" }} rpaths: - lib/ - {{ SP_DIR }}/pyarrow missing_dso_whitelist: # not actually missing, but installed into SP_DIR, see tests - - "*/arrow_python.dll" # [win] - - "*/arrow_python_flight.dll" # [win] + - '*/arrow_python.dll' # [win] + - '*/arrow_python_flight.dll' # [win] requirements: build: - {{ compiler("c") }} @@ -219,29 +230,28 @@ outputs: - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] + - cython <3 # [build_platform != target_platform] - numpy # [build_platform != target_platform] - cmake - ninja host: - - {{ pin_subpackage('libarrow', exact=True) }} + # we're building for two protobuf versions, cannot pin exactly + # - {{ pin_subpackage('libarrow', exact=True) }} + - libarrow ={{ version }}=*_{{ PKG_BUILDNUM }}_{{ build_ext }} - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - - cython - - gflags # [unix] + - cython <3 - numpy - python - setuptools - setuptools_scm run: - - {{ pin_subpackage('libarrow', exact=True) }} + # - {{ pin_subpackage('libarrow', exact=True) }} + - libarrow ={{ version }}=*_{{ PKG_BUILDNUM }}_{{ build_ext }} - {{ pin_compatible('numpy') }} - # empty parquet-cpp metapackage, force old versions to be uninstalled - - parquet-cpp 1.5.1.* - python run_constrained: - apache-arrow-proc =*={{ build_ext }} - - cudatoolkit >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] test: files: @@ -288,13 +298,13 @@ outputs: summary: Python libraries for Apache Arrow - name: pyarrow-tests - script: build-pyarrow.sh # [not win] - script: bld-pyarrow.bat # [win] + script: build-pyarrow.sh # [unix] + script: build-pyarrow.bat # [win] version: {{ version }} build: string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - ignore_run_exports: - - cudatoolkit + ignore_run_exports_from: + - {{ compiler("cuda") }} # 
[cuda_compiler_version != "None"] track_features: {{ "[arrow-cuda]" if cuda_enabled else "" }} requirements: build: @@ -304,7 +314,7 @@ outputs: - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] + - cython <3 # [build_platform != target_platform] - numpy # [build_platform != target_platform] - cmake - ninja @@ -313,7 +323,7 @@ outputs: - {{ pin_subpackage('pyarrow', exact=True) }} - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - - cython + - cython <3 - numpy - python - setuptools @@ -323,27 +333,36 @@ outputs: - python run_constrained: - apache-arrow-proc =*={{ build_ext }} - - cudatoolkit >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] + # crossbow CI: reduce to one python version, except on (unemulated) linux, where it's fast enough + {% if linux64 or py == 311 %} + # {% if not (aarch64 or ppc64le) or py in (310, 311) %} + # only run the full test suite for one python version when in emulation (each run takes ~45min); + # there's essentially zero divergence in behaviour across python versions anyway, and otherwise + # CUDA builds for aarch/ppc consistently run out of disk space on azure for some reason test: requires: + # vary protobuf version in test suite (historically, test failures only have a very + # weak dependency on python version, so we don't lose coverage by doing half & half) + - libprotobuf <4 # [py % 2 == 0] # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - - pytest + # temporary pin due to missing fixture + - pytest <7.4.0 - pytest-lazy-fixture - backports.zoneinfo # [py<39] - cffi - cloudpickle - - cython + - cython <3 - fastparquet - fsspec - hypothesis - pandas - scipy - # not all OSes/arches available in conda-forge - - pytorch * # [unix and not ppc64le] - # not yet rebuilt for libabseil 20230125 - # - tensorflow # [unix and x86_64 and py<311] + # these are generally (far) behind on migrating abseil/grpc/protobuf, + # and using them as test dependencies blocks the migrator unnecessarily + # - pytorch + # - tensorflow # we're not building java bindings # - jpype1 # doesn't get picked up correctly @@ -364,6 +383,8 @@ outputs: # skip tests that raise SIGINT and crash the test suite {% set tests_to_skip = tests_to_skip + " or (test_csv and test_cancellation)" %} # [linux] {% set tests_to_skip = tests_to_skip + " or (test_flight and test_interrupt)" %} # [linux] + # tests that may crash the agent due to out-of-bound memory writes or other risky stuff + {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool" %} # [aarch64 or ppc64le] # cannot pass -D_LIBCPP_DISABLE_AVAILABILITY to test suite for our older macos sdk {% set tests_to_skip = tests_to_skip + " or test_cpp_extension_in_python" %} # [osx] # skip tests that make invalid(-for-conda) assumptions about the compilers setup @@ -373,6 +394,7 @@ outputs: {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool_disabled" %} # [aarch64 or ppc64le] {% set tests_to_skip = tests_to_skip + " or test_env_var_io_thread_count" %} # [aarch64 or ppc64le] # vvvvvvv TESTS THAT SHOULDN'T HAVE TO BE SKIPPED vvvvvvv + {% set tests_to_skip = tests_to_skip + " or test_extension_to_pandas_storage_type" %} # segfaults on OSX: to investigate ASAP {% set tests_to_skip = tests_to_skip + " or test_flight" %} # [osx] # gandiva tests are segfaulting on ppc @@ -381,14 +403,14 @@ 
outputs: {% set tests_to_skip = tests_to_skip + " or test_safe_cast_from_float_with_nans_to_int" %} # [ppc64le] # gandiva tests are segfaulting on ppc {% set tests_to_skip = tests_to_skip + " or test_float_with_null_as_integer" %} # [ppc64le] - # "Unsupported backend 'nonexistent' specified in ARROW_DEFAULT_MEMORY_POOL" - {% set tests_to_skip = tests_to_skip + " or (test_memory and test_env_var)" %} # [unix] # test is broken; header is in $PREFIX, not $SP_DIR {% set tests_to_skip = tests_to_skip + " or (test_misc and test_get_include)" %} # [unix] + # flaky tests that fail occasionally + {% set tests_to_skip = tests_to_skip + " or test_total_bytes_allocated " %} # [linux] + {% set tests_to_skip = tests_to_skip + " or test_feather_format " %} # [linux] # ^^^^^^^ TESTS THAT SHOULDN'T HAVE TO BE SKIPPED ^^^^^^^ - - # crossbow CI: reduce to one python version, except on (unemulated) linux, where it's fast enough - - pytest -v -rfEs -k "not ({{ tests_to_skip }})" # [linux64 or (py==310 and build_platform==target_platform)] + - pytest -rfEs -k "not ({{ tests_to_skip }})" + {% endif %} about: home: http://github.com/apache/arrow diff --git a/dev/tasks/conda-recipes/azure.linux.yml b/dev/tasks/conda-recipes/azure.linux.yml index b9a54647cc525..279ffb48ccd60 100755 --- a/dev/tasks/conda-recipes/azure.linux.yml +++ b/dev/tasks/conda-recipes/azure.linux.yml @@ -13,6 +13,29 @@ jobs: UPLOAD_PACKAGES: False steps: + - script: | + sudo mkdir -p /opt/empty_dir || true + for d in \ + /opt/ghc \ + /opt/hostedtoolcache \ + /usr/lib/jvm \ + /usr/local/.ghcup \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/share/dotnet \ + /usr/share/swift \ + ; do + sudo rsync --stats -a --delete /opt/empty_dir/ $d || true + done + sudo apt-get purge -y -f firefox \ + google-chrome-stable \ + microsoft-edge-stable + sudo apt-get autoremove -y >& /dev/null + sudo apt-get autoclean -y >& /dev/null + sudo docker image prune --all --force + df -h + displayName: Manage disk space + # configure qemu binfmt-misc running. 
This allows us to run docker containers # embedded qemu-static - script: | diff --git a/dev/tasks/conda-recipes/r-arrow/configure.win b/dev/tasks/conda-recipes/r-arrow/configure.win index fb16a810b8f22..0fc96576bde74 100755 --- a/dev/tasks/conda-recipes/r-arrow/configure.win +++ b/dev/tasks/conda-recipes/r-arrow/configure.win @@ -2,7 +2,7 @@ set -euxo pipefail -echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -DARROW_R_WITH_JSON" > src/Makevars.win +echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ACERO -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -DARROW_R_WITH_JSON" > src/Makevars.win echo "PKG_CXXFLAGS=\$(CXX_VISIBILITY)" >> src/Makevars.win echo 'CXX_STD=CXX17' >> src/Makevars.win -echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -lparquet -larrow" >> src/Makevars.win +echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -larrow_acero -lparquet -larrow" >> src/Makevars.win diff --git a/dev/tasks/conda-recipes/r-arrow/meta.yaml b/dev/tasks/conda-recipes/r-arrow/meta.yaml index 28ee8eb92c921..e8b834254f41c 100644 --- a/dev/tasks/conda-recipes/r-arrow/meta.yaml +++ b/dev/tasks/conda-recipes/r-arrow/meta.yaml @@ -10,8 +10,6 @@ source: path: ../../../../ build: - # 4.1 not usable anymore unless https://github.com/conda-forge/r-base-feedstock/pull/236 gets merged - skip: true # [unix and (r_base == "4.1")] merge_build_host: true # [win] number: 0 rpaths: diff --git a/dev/tasks/docs/github.linux.yml b/dev/tasks/docs/github.linux.yml index f9b2e111e8f3e..6de297b663e01 100644 --- a/dev/tasks/docs/github.linux.yml +++ b/dev/tasks/docs/github.linux.yml @@ -21,11 +21,12 @@ jobs: test: - name: Docs Preview + name: Build Docs runs-on: ubuntu-latest {{ macros.github_set_env(env) }} steps: {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }} + {{ macros.github_free_space()|indent }} {{ macros.github_install_archery()|indent }} - name: Execute Docker Build @@ -44,7 +45,8 @@ jobs: ref: {{ default_branch|default("main") }} path: crossbow fetch-depth: 1 - - name: Prepare docs + {% if publish %} + - name: Prepare Docs Preview run: | # build files are created by the docker user sudo chown -R ${USER}: build @@ -61,3 +63,10 @@ jobs: run: | aws s3 cp build/docs/ $BUCKET/pr_docs/{{ pr_number }}/ --recursive echo ":open_book: You can find the preview here: http://crossbow.voltrondata.com/pr_docs/{{ pr_number }}" >> $GITHUB_STEP_SUMMARY + {% endif %} + - name: Prepare Docs artifacts + run: | + cd build + sudo chown -R ${USER}: . 
+ tar cvzf docs.tar.gz docs + {{ macros.github_upload_releases("build/docs.tar.gz")|indent }} diff --git a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb index 1cb3a036b519a..b861c06b0201f 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb @@ -32,7 +32,7 @@ class ApacheArrowGlib < Formula url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-13.0.0-SNAPSHOT/apache-arrow-13.0.0-SNAPSHOT.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" - head "https://github.com/apache/arrow.git" + head "https://github.com/apache/arrow.git", branch: "main" livecheck do formula "apache-arrow" @@ -46,18 +46,12 @@ class ApacheArrowGlib < Formula depends_on "apache-arrow" depends_on "glib" - on_linux do - depends_on "gcc" - end - fails_with gcc: "5" def install - mkdir "build" do - system "meson", *std_meson_args, "-Dvapi=true", "../c_glib" - system "ninja", "-v" - system "ninja", "install", "-v" - end + system "meson", "setup", "build", "c_glib", *std_meson_args, "-Dvapi=true" + system "meson", "compile", "-C", "build", "--verbose" + system "meson", "install", "-C", "build" end test do @@ -82,9 +76,9 @@ def install -DNDEBUG -larrow-glib -larrow - -lglib-2.0 - -lgobject-2.0 -lgio-2.0 + -lgobject-2.0 + -lglib-2.0 ] system ENV.cc, "test.c", "-o", "test", *flags system "./test" diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index 85e490e784a6f..f5d7ff36ea041 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -32,57 +32,50 @@ class ApacheArrow < Formula url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-13.0.0-SNAPSHOT/apache-arrow-13.0.0-SNAPSHOT.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" - head "https://github.com/apache/arrow.git" + head "https://github.com/apache/arrow.git", branch: "main" depends_on "boost" => :build depends_on "cmake" => :build - depends_on "llvm" => :build + depends_on "llvm@15" => :build depends_on "aws-sdk-cpp" depends_on "brotli" + depends_on "bzip2" depends_on "glog" depends_on "grpc" depends_on "lz4" - depends_on "numpy" - depends_on "openssl@1.1" + depends_on "mimalloc" + depends_on "openssl@3" depends_on "protobuf" - depends_on "python@3.10" depends_on "rapidjson" depends_on "re2" depends_on "snappy" depends_on "thrift" depends_on "utf8proc" depends_on "zstd" - - on_linux do - depends_on "gcc" - end + uses_from_macos "python" => :build fails_with gcc: "5" def install - python = "python3.10" - - # https://github.com/Homebrew/homebrew-core/issues/76537 + # This isn't for https://github.com/Homebrew/homebrew-core/issues/76537 . + # This may improve performance. ENV.runtime_cpu_detection if Hardware::CPU.intel? 
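    # runtime_cpu_detection asks Homebrew to build with runtime CPU-feature
    # dispatch rather than hard-coding the build host's ISA, so one bottle
    # stays usable across Intel machines; the Hardware::CPU.intel? guard
    # limits it to the only platform where that dispatch matters here.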
- # https://github.com/Homebrew/homebrew-core/issues/94724 - # https://issues.apache.org/jira/browse/ARROW-15664 - ENV["HOMEBREW_OPTIMIZATION_LEVEL"] = "O2" - # link against system libc++ instead of llvm provided libc++ ENV.remove "HOMEBREW_LIBRARY_PATHS", Formula["llvm"].opt_lib args = %W[ + -DCMAKE_INSTALL_RPATH=#{rpath} -DARROW_ACERO=ON -DARROW_COMPUTE=ON -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_FILESYSTEM=ON -DARROW_FLIGHT=ON + -DARROW_FLIGHT_SQL=ON -DARROW_GANDIVA=ON -DARROW_GCS=ON -DARROW_HDFS=ON -DARROW_INSTALL_NAME_RPATH=OFF - -DARROW_JEMALLOC=ON -DARROW_JSON=ON -DARROW_MIMALLOC=ON -DARROW_ORC=ON @@ -96,16 +89,17 @@ def install -DARROW_WITH_UTF8PROC=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON - -DCMAKE_CXX_STANDARD=17 - -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=TRUE - -DPython3_EXECUTABLE=#{which(python)} + -DPARQUET_BUILD_EXECUTABLES=ON ] + # Disable runtime SIMD dispatch. It may cause "illegal opcode" + # error on Intel Mac because of one-definition-rule violation. + # + # https://github.com/apache/arrow/issues/36685 + args << "-DARROW_RUNTIME_SIMD_LEVEL=NONE" if OS.mac? and Hardware::CPU.intel? - mkdir "build" do - system "cmake", "../cpp", *std_cmake_args, *args - system "make" - system "make", "install" - end + system "cmake", "-S", "cpp", "-B", "build", *args, *std_cmake_args + system "cmake", "--build", "build" + system "cmake", "--install", "build" end test do diff --git a/dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb b/dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb index c0df6a32175eb..4586649d0c0bc 100644 --- a/dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb +++ b/dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb @@ -25,7 +25,7 @@ class ApacheArrowStatic < Formula # Uncomment and update to test on a release candidate # mirror "https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-8.0.0-rc1/apache-arrow-8.0.0.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" - head "https://github.com/apache/arrow.git" + head "https://github.com/apache/arrow.git", branch: "main" bottle do sha256 cellar: :any, arm64_big_sur: "ef89d21a110b89840cc6148add685d407e75bd633bc8f79625eb33d00e3694b4" diff --git a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb index c09436d777ae9..b47d0edfe0dd7 100644 --- a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb @@ -21,7 +21,7 @@ class ApacheArrow < Formula homepage "https://arrow.apache.org/" url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-12.0.1.9000/apache-arrow-12.0.1.9000.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" - head "https://github.com/apache/arrow.git" + head "https://github.com/apache/arrow.git", branch: "main" bottle do cellar :any @@ -35,6 +35,7 @@ class ApacheArrow < Formula depends_on "aws-sdk-cpp" depends_on "brotli" depends_on "lz4" + depends_on "openssl@1.1" depends_on "snappy" depends_on "thrift" depends_on "zstd" diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 59a8edb69e6e4..57f595e9db44e 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -22,8 +22,28 @@ jobs: build-cpp-ubuntu: - name: Build C++ libraries Ubuntu - runs-on: ubuntu-latest + {% set arch = '${{ matrix.platform.arch }}' %} + name: Build C++ libraries Ubuntu {{ arch }} + runs-on: {{ '${{ matrix.platform.runs_on }}' }} + env: + # architecture name used for 
archery build + ARCH: {{ '${{ matrix.platform.archery_arch }}' }} + ARCH_ALIAS: {{ '${{ matrix.platform.archery_arch_alias }}' }} + ARCH_SHORT: {{ '${{ matrix.platform.archery_arch_short }}' }} + strategy: + fail-fast: false + matrix: + platform: + - runs_on: ["ubuntu-latest"] + arch: "x86_64" + archery_arch: "amd64" + archery_arch_alias: "x86_64" + archery_arch_short: "amd64" + - runs_on: ["self-hosted", "Linux", "arm64"] + arch: "aarch_64" + archery_arch: "arm64v8" + archery_arch_alias: "aarch64" + archery_arch_short: "arm64" steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_install_archery()|indent }} @@ -36,12 +56,12 @@ jobs: -e ARROW_JAVA_TEST=OFF \ java-jni-manylinux-2014 - name: Compress into single artifact to keep directory structure - run: tar -cvzf arrow-shared-libs-linux.tar.gz arrow/java-dist/ + run: tar -cvzf arrow-shared-libs-linux-{{ arch }}.tar.gz arrow/java-dist/ - name: Upload artifacts uses: actions/upload-artifact@v2 with: - name: ubuntu-shared-lib - path: arrow-shared-libs-linux.tar.gz + name: ubuntu-shared-lib-{{ arch }} + path: arrow-shared-libs-linux-{{ arch }}.tar.gz {% if arrow.is_default_branch() %} {{ macros.github_login_dockerhub()|indent }} - name: Push Docker image @@ -178,7 +198,8 @@ jobs: - name: Decompress artifacts run: | mv artifacts/*/*.tar.gz . - tar -xvzf arrow-shared-libs-linux.tar.gz + tar -xvzf arrow-shared-libs-linux-x86_64.tar.gz + tar -xvzf arrow-shared-libs-linux-aarch_64.tar.gz tar -xvzf arrow-shared-libs-macos-x86_64.tar.gz tar -xvzf arrow-shared-libs-macos-aarch_64.tar.gz tar -xvzf arrow-shared-libs-windows.tar.gz @@ -191,6 +212,11 @@ jobs: test -f arrow/java-dist/x86_64/libarrow_orc_jni.so test -f arrow/java-dist/x86_64/libgandiva_jni.so + test -f arrow/java-dist/aarch_64/libarrow_cdata_jni.so + test -f arrow/java-dist/aarch_64/libarrow_dataset_jni.so + test -f arrow/java-dist/aarch_64/libarrow_orc_jni.so + test -f arrow/java-dist/aarch_64/libgandiva_jni.so + test -f arrow/java-dist/x86_64/libarrow_cdata_jni.dylib test -f arrow/java-dist/x86_64/libarrow_dataset_jni.dylib test -f arrow/java-dist/x86_64/libarrow_orc_jni.dylib diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 67e0f8db8a4dd..46c6d91b2d5dc 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -238,27 +238,11 @@ cd - %package -n %{name}%{major_version}-libs Summary: Runtime libraries for Apache Arrow C++ License: Apache-2.0 -Requires: brotli -%if %{use_gflags} -Requires: gflags -%endif -%if %{use_glog} -Requires: glog -%endif -Requires: libzstd %if %{have_lz4_libs} Requires: lz4-libs %{lz4_requirement} %else Requires: lz4 %{lz4_requirement} %endif -%if %{have_re2} -Requires: re2 -%endif -Requires: snappy -%if %{have_utf8proc} -Requires: utf8proc -%endif -Requires: zlib %description -n %{name}%{major_version}-libs This package contains the libraries for Apache Arrow C++. @@ -414,8 +398,6 @@ Libraries and header files for Apache Arrow dataset. Summary: C++ library for fast data transport. License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: c-ares -Requires: openssl %description -n %{name}%{major_version}-flight-libs This package contains the libraries for Apache Arrow Flight. @@ -485,7 +467,6 @@ Libraries and header files for Apache Arrow Flight SQL. Summary: C++ library for compiling and evaluating expressions on Apache Arrow data. 
License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: ncurses-libs %description -n gandiva%{major_version}-libs This package contains the libraries for Gandiva. @@ -521,7 +502,6 @@ Libraries and header files for Gandiva. Summary: Runtime libraries for Apache Parquet C++ License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: openssl %description -n parquet%{major_version}-libs This package contains the libraries for Apache Parquet C++. @@ -570,7 +550,6 @@ Libraries and header files for Apache Parquet C++. Summary: Runtime libraries for Apache Arrow GLib License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: glib2 %description -n %{name}%{major_version}-glib-libs This package contains the libraries for Apache Arrow GLib. diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index bf28cf10e9ecf..6de3edfce07e1 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -32,56 +32,8 @@ jobs: steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} + {{ macros.github_free_space()|indent }} - - name: Free up disk space - if: | - env.ARCHITECTURE == 'amd64' - run: | - df -h - echo "::group::/usr/local/*" - du -hsc /usr/local/* - echo "::endgroup::" - echo "::group::/usr/local/bin/*" - du -hsc /usr/local/bin/* - echo "::endgroup::" - # ~1GB (From 1.2GB to 214MB) - sudo rm -rf \ - /usr/local/bin/aliyun \ - /usr/local/bin/azcopy \ - /usr/local/bin/bicep \ - /usr/local/bin/cmake-gui \ - /usr/local/bin/cpack \ - /usr/local/bin/helm \ - /usr/local/bin/hub \ - /usr/local/bin/kubectl \ - /usr/local/bin/minikube \ - /usr/local/bin/node \ - /usr/local/bin/packer \ - /usr/local/bin/pulumi* \ - /usr/local/bin/stack \ - /usr/local/bin/terraform || : - echo "::group::/usr/local/share/*" - du -hsc /usr/local/share/* - echo "::endgroup::" - # 1.3GB - sudo rm -rf /usr/local/share/powershell || : - echo "::group::/opt/*" - du -hsc /opt/* - echo "::endgroup::" - echo "::group::/opt/hostedtoolcache/*" - du -hsc /opt/hostedtoolcache/* - echo "::endgroup::" - # 5.3GB - sudo rm -rf /opt/hostedtoolcache/CodeQL || : - # 1.4GB - sudo rm -rf /opt/hostedtoolcache/go || : - # 489MB - sudo rm -rf /opt/hostedtoolcache/PyPy || : - # 1.2GB - sudo rm -rf /opt/hostedtoolcache/Python || : - # 376MB - sudo rm -rf /opt/hostedtoolcache/node || : - df -h - name: Set up Ruby run: | sudo apt update diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 1fdfc08be03e6..757c15c937ce5 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -72,6 +72,60 @@ on: run: pip install -e arrow/dev/archery[all] {% endmacro %} +{%- macro github_free_space() -%} + - name: Free up disk space + if: runner.os == 'Linux' && runner.arch == 'X64' + shell: bash + run: | + df -h + echo "::group::/usr/local/*" + du -hsc /usr/local/* + echo "::endgroup::" + echo "::group::/usr/local/bin/*" + du -hsc /usr/local/bin/* + echo "::endgroup::" + # ~1GB (From 1.2GB to 214MB) + sudo rm -rf \ + /usr/local/bin/aliyun \ + /usr/local/bin/azcopy \ + /usr/local/bin/bicep \ + /usr/local/bin/cmake-gui \ + /usr/local/bin/cpack \ + /usr/local/bin/helm \ + /usr/local/bin/hub \ + /usr/local/bin/kubectl \ + /usr/local/bin/minikube \ + /usr/local/bin/node \ + /usr/local/bin/packer \ + /usr/local/bin/pulumi* \ + /usr/local/bin/stack \ + /usr/local/bin/terraform || : + echo "::group::/usr/local/share/*" + du -hsc 
/usr/local/share/* + echo "::endgroup::" + # 1.3GB + sudo rm -rf /usr/local/share/powershell || : + echo "::group::/opt/*" + du -hsc /opt/* + echo "::endgroup::" + echo "::group::/opt/hostedtoolcache/*" + du -hsc /opt/hostedtoolcache/* + echo "::endgroup::" + # 5.3GB + sudo rm -rf /opt/hostedtoolcache/CodeQL || : + # 1.4GB + sudo rm -rf /opt/hostedtoolcache/go || : + # 489MB + sudo rm -rf /opt/hostedtoolcache/PyPy || : + # 376MB + sudo rm -rf /opt/hostedtoolcache/node || : + # Remove Web browser packages + sudo apt-get purge -y -f firefox \ + google-chrome-stable \ + microsoft-edge-stable + df -h +{% endmacro %} + {%- macro github_upload_releases(pattern) -%} - name: Set up Python by actions/setup-python if: runner.arch == 'X64' @@ -137,12 +191,26 @@ on: run: | sudo apt update sudo apt install -y ruby-full + - name: Set up Ruby by GitHub Actions + if: runner.arch == 'X64' && runner.os != 'macOS' + uses: ruby/setup-ruby@v1 + with: + ruby-version: "ruby" + - name: Install gemfury client on ARM self-hosted + if: runner.arch != 'X64' + run: | + # GH-36692: Pin gemfury due to wrong faraday dependency declaration. + gem install --user-install gemfury -v 0.12.0 + ruby -r rubygems -e 'puts("#{Gem.user_dir}/bin")' >> $GITHUB_PATH + - name: Install gemfury client + if: runner.arch == 'X64' + run: | + # GH-36692: Pin gemfury due to wrong faraday dependency declaration. + gem install gemfury -v 0.12.0 - name: Upload package to Gemfury shell: bash run: | - PATH=$(echo $(ruby -r rubygems -e 'puts Gem.user_dir') | sed "s/C:\//\/c\//")/bin:$PATH - gem install --user-install gemfury - fury \ + fury push \ --api-token=${CROSSBOW_GEMFURY_TOKEN} \ --as=${CROSSBOW_GEMFURY_ORG} \ {{ pattern }} @@ -214,6 +282,7 @@ on: # see https://github.com/actions/runner-images/issues/6868 brew install --overwrite python@3.11 python@3.10 + set -x ARROW_GLIB_FORMULA=$(echo ${ARROW_FORMULA} | sed -e 's/\.rb/-glib.rb/') echo "ARROW_GLIB_FORMULA=${ARROW_GLIB_FORMULA}" >> ${GITHUB_ENV} for formula in ${ARROW_FORMULA} ${ARROW_GLIB_FORMULA}; do @@ -223,11 +292,12 @@ on: # Pin the current commit in the formula to test so that # we're not always pulling from the tip of the default branch sed -i '' -E \ - -e 's@https://github.com/apache/arrow.git"$@{{ arrow.remote }}.git", revision: "{{ arrow.head }}"@' \ + -e 's@https://github.com/apache/arrow.git", branch: "main"$@{{ arrow.remote }}.git", revision: "{{ arrow.head }}"@' \ ${formula} # Sometimes crossbow gives a remote URL with .git and sometimes not. 
# Make sure there's only one sed -i '' -E -e 's@.git.git@.git@' ${formula} + cat ${formula} cp ${formula} $(brew --repository homebrew/core)/Formula/ done {% endmacro %} @@ -238,13 +308,14 @@ on: cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow*.rb tools/ # Pin the git commit in the formula to match - cd tools + pushd tools if [ "{{ is_fork }}" == "true" ]; then - sed -i.bak -E -e 's/apache\/arrow.git"$/{{ arrow.github_repo.split("/") | join("\/") }}.git", :revision => "'"{{ arrow.head }}"'"/' apache-arrow*.rb + sed -i.bak -E -e 's/apache\/arrow.git", branch: "main"$/{{ arrow.github_repo.split("/") | join("\/") }}.git", :revision => "'"{{ arrow.head }}"'"/' apache-arrow*.rb else - sed -i.bak -E -e 's/arrow.git"$/arrow.git", :revision => "'"{{ arrow.head }}"'"/' apache-arrow*.rb + sed -i.bak -E -e 's/arrow.git", branch: "main"$/arrow.git", :revision => "'"{{ arrow.head }}"'"/' apache-arrow*.rb fi rm -f apache-arrow*.rb.bak + popd {% endmacro %} {%- macro github_change_r_pkg_version(is_fork, version) -%} diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml index 7126b1418253f..0839e7fc6afcf 100644 --- a/dev/tasks/r/github.devdocs.yml +++ b/dev/tasks/r/github.devdocs.yml @@ -38,10 +38,9 @@ jobs: # remove after https://issues.apache.org/jira/browse/ARROW-16376 r-version: '4.1' - uses: r-lib/actions/setup-pandoc@v2 - - name: Install knitr, rmarkdown - run: | - install.packages(c("rmarkdown", "knitr", "sessioninfo")) - shell: Rscript {0} + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + packages: "rmarkdown, knitr, sessioninfo" - name: Session info run: | options(width = 100) diff --git a/dev/tasks/r/github.macos.brew.yml b/dev/tasks/r/github.macos.brew.yml index dea7564c5f80f..8a0e6f6bb96e3 100644 --- a/dev/tasks/r/github.macos.brew.yml +++ b/dev/tasks/r/github.macos.brew.yml @@ -24,21 +24,33 @@ jobs: name: "Homebrew + R package" runs-on: macOS-11 steps: + - name: Show system information + run: | + sysctl hw.optional machdep.cpu + {{ macros.github_checkout_arrow()|indent }} {{ macros.configure_homebrew_arrow(formula)|indent }} - name: Install apache-arrow env: - {{ macros.github_set_sccache_envvars()|indent(8)}} + {{ macros.github_set_sccache_envvars()|indent(8)}} run: | - brew install sccache # for testing brew install minio - + # TODO(ARROW-16907): apache/arrow@main seems to be installed already # so this does nothing on a branch/PR - brew install -v --HEAD apache-arrow + brew install -v --HEAD {{ '$(brew --repository homebrew/core)/Formula/apache-arrow.rb' }} + + mkdir -p homebrew-logs + cp -a ~/Library/Logs/Homebrew/apache-arrow homebrew-logs/ + - name: Save logs + if: always() + uses: actions/upload-artifact@v2 + with: + name: homebrew-logs + path: homebrew-logs - uses: r-lib/actions/setup-r@v2 - name: Install dependencies diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 879c2246b41ee..73b793162d959 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -64,7 +64,7 @@ groups: - r-binary-packages - ubuntu-* - wheel-* - - test-ubuntu-default-docs + - test-ubuntu-*-docs {############################# Testing tasks #################################} @@ -246,6 +246,16 @@ tasks: # generated and to be synced regularly from the feedstock. We have no way # yet to generate them inside the arrow repository automatically. 
+ conda-linux-x64-cpu-r41: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_64_cuda_compiler_versionNone + r_config: linux_64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-linux-x64-cpu-r42: ci: azure template: conda-recipes/azure.linux.yml @@ -272,7 +282,7 @@ tasks: ci: azure template: conda-recipes/azure.linux.yml params: - config: linux_64_cuda_compiler_version10.2 + config: linux_64_cuda_compiler_version11.2 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda @@ -282,6 +292,16 @@ tasks: ########################### Conda Linux (aarch64) ########################### + conda-linux-aarch64-cpu-r41: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_aarch64_cuda_compiler_versionNone + r_config: linux_aarch64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-linux-aarch64-cpu-r42: ci: azure template: conda-recipes/azure.linux.yml @@ -304,6 +324,18 @@ tasks: - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cpu.conda - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cpu.conda + conda-linux-aarch64-cuda-py3: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_aarch64_cuda_compiler_version11.2 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cuda.conda + ########################### Conda Linux (ppc64le) ########################### conda-linux-ppc64le-cpu-py3: @@ -318,8 +350,30 @@ tasks: - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cpu.conda - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cpu.conda + conda-linux-ppc64le-cuda-py3: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_ppc64le_cuda_compiler_version11.2 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cuda.conda + ############################## Conda OSX (x64) ############################## + conda-osx-x64-cpu-r41: + ci: azure + template: conda-recipes/azure.osx.yml + params: + config: osx_64_ + r_config: osx_64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-osx-x64-cpu-r42: ci: azure template: conda-recipes/azure.osx.yml @@ -344,6 +398,16 @@ tasks: ############################# Conda OSX (arm64) ############################# + conda-osx-arm64-cpu-r41: + ci: azure + template: conda-recipes/azure.osx.yml + params: + config: osx_arm64_ + r_config: osx_arm64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-osx-arm64-cpu-r42: ci: azure template: conda-recipes/azure.osx.yml @@ -396,7 +460,7 @@ tasks: ci: azure template: conda-recipes/azure.win.yml params: - config: win_64_cuda_compiler_versionNone + config: win_64_cuda_compiler_version11.2 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda - 
pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda @@ -1434,17 +1498,16 @@ tasks: image: debian-go {% endfor %} - test-ubuntu-default-docs: - ci: azure - template: docker-tests/azure.linux.yml + test-ubuntu-22.04-docs: + ci: github + template: docs/github.linux.yml params: - artifacts: "build/docs.tar.gz" + env: + UBUNTU: 22.04 + pr_number: Unset flags: "-v $PWD/build/:/build/" image: ubuntu-docs - post_script: | - cd build - sudo chown -R ${USER}: . - tar cvzf docs.tar.gz docs + publish: false artifacts: - docs.tar.gz @@ -1525,9 +1588,9 @@ tasks: image: conda-python-hdfs {% endfor %} -{% for python_version, spark_version, test_pyarrow_only, numpy_version in [("3.8", "v3.1.2", "false", "latest"), - ("3.9", "v3.2.0", "false", "1.23"), - ("3.10", "master", "false", "latest")] %} +{% for python_version, spark_version, test_pyarrow_only, numpy_version in [("3.8", "v3.4.1", "false", "latest"), + ("3.10", "v3.4.1", "false", "1.23"), + ("3.11", "master", "false", "latest")] %} test-conda-python-{{ python_version }}-spark-{{ spark_version }}: ci: github template: docker-tests/github.linux.yml @@ -1564,7 +1627,9 @@ tasks: ci: github template: docs/github.linux.yml params: + env: + UBUNTU: 22.04 pr_number: Unset - artifacts: "build/docs.tar.gz" flags: "-v $PWD/build/:/build/" image: ubuntu-docs + publish: true diff --git a/docker-compose.yml b/docker-compose.yml index 0b0d0e57b30ce..fe98a30d0b92b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,10 @@ x-sccache: &sccache SCCACHE_REGION: SCCACHE_S3_KEY_PREFIX: ${SCCACHE_S3_KEY_PREFIX:-sccache} +x-cpp: &cpp + ARROW_RUNTIME_SIMD_LEVEL: + ARROW_SIMD_LEVEL: + # CPU/memory limit presets to pass to Docker. # # Usage: archery docker run --resource-limit=github @@ -227,7 +231,7 @@ services: ulimits: &ulimits core: ${ULIMIT_CORE} environment: - <<: [*common, *ccache] + <<: [*common, *ccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" volumes: &alpine-linux-volumes @@ -278,7 +282,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_BUILD_BENCHMARKS: "ON" ARROW_BUILD_EXAMPLES: "ON" ARROW_ENABLE_TIMING_TESTS: # inherit @@ -313,7 +317,7 @@ services: arch: ${ARCH} shm_size: *shm-size environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] # Shrink test runtime by enabling minimal optimizations ARROW_C_FLAGS_DEBUG: "-g1 -Og" ARROW_CXX_FLAGS_DEBUG: "-g1 -Og" @@ -349,7 +353,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" volumes: &debian-volumes @@ -390,7 +394,7 @@ services: - apparmor:unconfined ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" volumes: &ubuntu-volumes @@ -426,7 +430,7 @@ services: - apparmor:unconfined ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_HOME: /arrow ARROW_DEPENDENCY_SOURCE: BUNDLED LIBARROW_MINIMAL: "false" @@ -448,7 +452,7 @@ services: volumes: - .:/arrow:delegated environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_DEPENDENCY_SOURCE: BUNDLED ARROW_HOME: /arrow LIBARROW_MINIMAL: "false" @@ -470,7 +474,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, 
*sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_DEPENDENCY_SOURCE: BUNDLED CMAKE_GENERATOR: "Unix Makefiles" volumes: *ubuntu-volumes @@ -491,7 +495,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_BUILD_UTILITIES: "OFF" ARROW_COMPUTE: "OFF" ARROW_CSV: "OFF" @@ -538,7 +542,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_BUILD_UTILITIES: "OFF" ARROW_COMPUTE: "OFF" ARROW_CSV: "OFF" @@ -588,7 +592,7 @@ services: shm_size: *shm-size volumes: *ubuntu-volumes environment: - <<: [*common, *ccache] + <<: [*common, *ccache, *cpp] CC: clang-${CLANG_TOOLS} CXX: clang++-${CLANG_TOOLS} # Avoid creating huge static libraries @@ -630,7 +634,7 @@ services: shm_size: *shm-size volumes: *ubuntu-volumes environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] CC: clang-${CLANG_TOOLS} CXX: clang++-${CLANG_TOOLS} ARROW_BUILD_STATIC: "OFF" @@ -662,7 +666,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 @@ -1147,7 +1151,6 @@ services: command: ["pip install -e /arrow/dev/archery && \ /arrow/ci/scripts/java_jni_manylinux_build.sh /arrow /build /arrow/java-dist && \ - source /opt/rh/rh-maven35/enable && \ /arrow/ci/scripts/java_build.sh /arrow /build /arrow/java-dist && \ /arrow/ci/scripts/java_test.sh /arrow /build /arrow/java-dist"] @@ -1742,7 +1745,6 @@ services: BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" - Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 volumes: *ubuntu-volumes command: &docs-command > /bin/bash -c " diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 06fd73ffd0d98..8d10fd2cc2e40 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -26,6 +26,29 @@ Arrow C++ at runtime. Many of these variables are inspected only once per process (for example, when the Arrow C++ DLL is loaded), so you cannot assume that changing their value later will have an effect. +.. envvar:: ACERO_ALIGNMENT_HANDLING + + Arrow C++'s Acero module performs computation on streams of data. This + computation may involve a form of "type punning" that is technically + undefined behavior if the underlying array is not properly aligned. On + most modern CPUs this is not an issue, but some older CPUs may crash or + suffer poor performance. For this reason it is recommended that all + incoming array buffers are properly aligned, but some data sources + such as :ref:`Flight ` may produce unaligned buffers. + + The value of this environment variable controls what will happen when + Acero detects an unaligned buffer: + + - ``warn``: a warning is emitted + - ``ignore``: nothing, alignment checking is disabled + - ``reallocate``: the buffer is reallocated to a properly aligned address + - ``error``: the operation fails with an error + + The default behavior is ``warn``. On modern hardware it is usually safe + to change this to ``ignore``. Changing to ``reallocate`` is the safest + option but this will have a significant performance impact as the buffer + will need to be copied. + .. envvar:: ARROW_DEBUG_MEMORY_POOL Enable rudimentary memory checks to guard against buffer overflows. 
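To make the new ``ACERO_ALIGNMENT_HANDLING`` modes concrete, here is a minimal sketch of launching an Acero-backed process with alignment checking disabled. Only the variable name and its values come from the documentation above; the ``./acero-pipeline`` binary is a hypothetical stand-in, and the variable must be set before the process starts because such variables are inspected only once per process::

   package main

   import (
       "log"
       "os"
       "os/exec"
   )

   func main() {
       // Hypothetical Acero-backed binary; substitute any program that
       // links against Arrow C++'s Acero module.
       cmd := exec.Command("./acero-pipeline")
       // "ignore" disables alignment checking; "warn" is the default,
       // "reallocate" copies to aligned buffers, "error" fails the operation.
       cmd.Env = append(os.Environ(), "ACERO_ALIGNMENT_HANDLING=ignore")
       cmd.Stdout = os.Stdout
       cmd.Stderr = os.Stderr
       if err := cmd.Run(); err != nil {
           log.Fatalf("pipeline failed: %v", err)
       }
   }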
@@ -115,6 +138,10 @@ that changing their value later will have an effect. SIGILL (Illegal Instruction). User must rebuild Arrow and PyArrow from scratch by setting cmake option ``ARROW_SIMD_LEVEL=NONE``. +.. envvar:: AWS_ENDPOINT_URL + + Endpoint URL used for S3-like storage, for example Minio or s3.scality. + .. envvar:: GANDIVA_CACHE_SIZE The number of entries to keep in the Gandiva JIT compilation cache. diff --git a/docs/source/developers/continuous_integration/crossbow.rst b/docs/source/developers/continuous_integration/crossbow.rst index 663fc17c0a028..6308f077ac9a6 100644 --- a/docs/source/developers/continuous_integration/crossbow.rst +++ b/docs/source/developers/continuous_integration/crossbow.rst @@ -47,7 +47,7 @@ Executors Individual jobs are executed on public CI services, currently: - Linux: GitHub Actions, Travis CI, Azure Pipelines -- macOS: GitHub Actions, Travis CI, Azure Pipelines +- macOS: GitHub Actions, Azure Pipelines - Windows: GitHub Actions, Azure Pipelines Queue @@ -59,7 +59,7 @@ queue for the tasks. Anyone can host a ``queue`` repository (usually named ``/crossbow``). A job is a git commit on a particular git branch, containing the required -configuration files to run the requested builds (like ``.travis.yml``, +configuration files to run the requested builds (like ``.travis.yml``, ``azure-pipelines.yml``, or ``crossbow.yml`` for `GitHub Actions`_ ). Scheduler @@ -118,7 +118,7 @@ to step 3: ``https://travis-ci.com///settings`` - Confirm the `auto cancellation`_ feature is turned off for branch builds. This should be the default setting. - + 7. Install Python (minimum supported version is 3.8): | Miniconda is preferred, see installation instructions: diff --git a/docs/source/developers/continuous_integration/overview.rst b/docs/source/developers/continuous_integration/overview.rst index 70323c9e48927..1d82e845a3360 100644 --- a/docs/source/developers/continuous_integration/overview.rst +++ b/docs/source/developers/continuous_integration/overview.rst @@ -26,7 +26,6 @@ Some files central to Arrow CI are: - ``docker-compose.yml`` - here we define docker services which can be configured using either environment variables or the default values for these variables. - ``.env`` - here we define default values to configure the services in ``docker-compose.yml`` -- ``.travis.yml`` - here we define workflows which run on Travis - ``appveyor.yml`` - here we define workflows that run on Appveyor We use :ref:`Docker` in order to have portable and reproducible Linux builds, as well as running Windows builds in Windows containers. We use :ref:`Archery` and :ref:`Crossbow` to help coordinate the various CI tasks.
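As a rough companion to the new ``AWS_ENDPOINT_URL`` entry documented earlier in this change, the sketch below points S3-style access at a local MinIO server purely through the environment. The endpoint and credentials are placeholders for a default local MinIO setup, not values taken from this change::

   package main

   import (
       "fmt"
       "os"
   )

   func main() {
       // Placeholder values for a locally running MinIO instance.
       os.Setenv("AWS_ENDPOINT_URL", "http://127.0.0.1:9000")
       os.Setenv("AWS_ACCESS_KEY_ID", "minioadmin")
       os.Setenv("AWS_SECRET_ACCESS_KEY", "minioadmin")

       // Any Arrow-based process started from here on (or this process,
       // if it later creates an S3 filesystem) sees the override.
       fmt.Println("S3 endpoint override:", os.Getenv("AWS_ENDPOINT_URL"))
   }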
@@ -60,7 +59,6 @@ The ``.yml`` files in ``.github/workflows`` are workflows which are run on GitHub There are two other files which define action-triggered builds: -- ``.travis.yml`` - runs on all commits and is used to test on architectures such as ARM and S390x - ``appveyor.yml`` - runs on commits related to Python or C++ Extended builds diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 5b525d467731e..061c616d4b971 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -132,11 +132,7 @@ Maven $ cd arrow/java $ export JAVA_HOME= $ java --version - $ mvn generate-resources \ - -Pgenerate-libs-jni-macos-linux \ - -DARROW_GANDIVA=ON \ - -DARROW_JAVA_JNI_ENABLE_GANDIVA=ON \ - -N + $ mvn generate-resources -Pgenerate-libs-jni-macos-linux -N $ ls -latr java-dist/lib//*_{jni,java}.* |__ libarrow_dataset_jni.dylib |__ libarrow_orc_jni.dylib @@ -216,9 +212,11 @@ CMake -DARROW_FILESYSTEM=ON \ -DARROW_GANDIVA=ON \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ + -DARROW_JSON=ON \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_S3=ON \ + -DARROW_SUBSTRAIT=ON \ -DARROW_USE_CCACHE=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_LIBDIR=lib/ \ @@ -234,7 +232,9 @@ CMake -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_LIBDIR=lib/ \ -DCMAKE_INSTALL_PREFIX=java-dist \ - -DCMAKE_PREFIX_PATH=$PWD/java-dist + -DCMAKE_PREFIX_PATH=$PWD/java-dist \ + -DProtobuf_ROOT=$PWD/../cpp-jni/protobuf_ep-install \ + -DProtobuf_USE_STATIC_LIBS=ON $ cmake --build java-jni --target install --config Release $ ls -latr java-dist/lib//*_{jni,java}.* |__ libarrow_dataset_jni.dylib @@ -255,9 +255,12 @@ CMake -DARROW_DATASET=ON ^ -DARROW_DEPENDENCY_USE_SHARED=OFF ^ -DARROW_FILESYSTEM=ON ^ - -DARROW_ORC=OFF ^ + -DARROW_GANDIVA=OFF ^ + -DARROW_JSON=ON ^ + -DARROW_ORC=ON ^ -DARROW_PARQUET=ON ^ -DARROW_S3=ON ^ + -DARROW_SUBSTRAIT=ON ^ -DARROW_USE_CCACHE=ON ^ -DARROW_WITH_BROTLI=ON ^ -DARROW_WITH_LZ4=ON ^ @@ -276,9 +279,10 @@ CMake -S java ^ -B java-jni ^ -DARROW_JAVA_JNI_ENABLE_C=OFF ^ + -DARROW_JAVA_JNI_ENABLE_DATASET=ON ^ -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON ^ -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF ^ - -DARROW_JAVA_JNI_ENABLE_ORC=OFF ^ + -DARROW_JAVA_JNI_ENABLE_ORC=ON ^ -DBUILD_TESTING=OFF ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_INSTALL_LIBDIR=lib/x86_64 ^ @@ -286,6 +290,7 @@ CMake -DCMAKE_PREFIX_PATH=$PWD/java-dist $ cmake --build java-jni --target install --config Release $ dir "java-dist/bin" + |__ arrow_orc_jni.dll |__ arrow_dataset_jni.dll Archery diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index cb0d713f50d0c..066400b33ffb5 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -497,8 +497,8 @@ Be sure to go through the following checklist: Our CI systems give us some coverage for the things that CRAN checks, but there are a couple of final tests we should do to confirm that the release binaries will work and that everything runs on the same infrastructure that - CRAN has, which is difficult/impossible to emulate fully on Travis or with - Docker. For a precise list of checks, see the + CRAN has, which is difficult/impossible to emulate fully with Docker. For a + precise list of checks, see the `packaging checklist `_.
Once all checks are clean, we submit to CRAN, which has a web form for diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5dd269ee5c675..9f7948cbfe980 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -147,3 +147,21 @@ Fixed shape tensor This structure has no relationship with the Fixed shape tensor extension type defined by this specification. Instead, this extension type lets one use fixed shape tensors as elements in a field of a RecordBatch or a Table. + +========================= +Community Extension Types +========================= + +In addition to the canonical extension types listed above, there exist Arrow +extension types that have been established as standards within specific domain +areas. These have not been officially designated as canonical through a +discussion and vote on the Arrow development mailing list but are well known +within subcommunities of Arrow developers. + +GeoArrow +======== + +`GeoArrow `_ defines a collection of +Arrow extension types for representing vector geometries. It is well known +within the Arrow geospatial subcommunity. The GeoArrow specification is not yet +finalized. diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index b90e2c97ade73..3390f1b7b5f2c 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -256,15 +256,15 @@ Would look like: :: * Length: 5, Null count: 1 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00011101 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00011101 | 0 (padding) | * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | unspecified | 2 | 4 | 8 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |-------------|-------------|-------------|-------------|-------------|-----------------------| + | 1 | unspecified | 2 | 4 | 8 | unspecified (padding) | **Example Layout: Non-null int32 Array** @@ -279,9 +279,9 @@ Would look like: :: * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |-------------|-------------|-------------|-------------|-------------|-----------------------| + | 1 | 2 | 3 | 4 | 8 | unspecified (padding) | or with the bitmap elided: :: @@ -289,9 +289,9 @@ or with the bitmap elided: :: * Validity bitmap buffer: Not required * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |-------------|-------------|-------------|-------------|-------------|-----------------------| + | 1 | 2 | 3 | 4 | 8 | unspecified (padding) | Variable-size Binary Layout --------------------------- @@ -342,13 +342,13 @@ will be represented as follows: :: | Bytes 0-19 | Bytes 20-63 | |----------------|-----------------------| - | 
0, 3, 3, 3, 7 | unspecified | + | 0, 3, 3, 3, 7 | unspecified (padding) | * Value buffer: - | Bytes 0-6 | Bytes 7-63 | - |----------------|----------------------| - | joemark | unspecified | + | Bytes 0-6 | Bytes 7-63 | + |----------------|-----------------------| + | joemark | unspecified (padding) | .. _variable-size-list-layout: @@ -388,18 +388,18 @@ will have the following representation: :: * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 3 | 3 | 7 | 7 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-----------------------| + | 0 | 3 | 3 | 7 | 7 | unspecified (padding) | * Values array (Int8array): * Length: 7, Null count: 0 * Validity bitmap buffer: Not required * Values buffer (int8) - | Bytes 0-6 | Bytes 7-63 | - |------------------------------|-------------| - | 12, -7, 25, 0, -127, 127, 50 | unspecified | + | Bytes 0-6 | Bytes 7-63 | + |------------------------------|-----------------------| + | 12, -7, 25, 0, -127, 127, 50 | unspecified (padding) | **Example Layout: ``List>``** @@ -412,9 +412,9 @@ will be represented as follows: :: * Validity bitmap buffer: Not required * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |------------|------------|------------|-------------|-------------| - | 0 | 2 | 5 | 6 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|------------|------------|-------------|-----------------------| + | 0 | 2 | 5 | 6 | unspecified (padding) | * Values array (`List`) * Length: 6, Null count: 1 @@ -426,17 +426,17 @@ will be represented as follows: :: * Offsets buffer (int32) - | Bytes 0-27 | Bytes 28-63 | - |----------------------|-------------| - | 0, 2, 4, 7, 7, 8, 10 | unspecified | + | Bytes 0-27 | Bytes 28-63 | + |----------------------|-----------------------| + | 0, 2, 4, 7, 7, 8, 10 | unspecified (padding) | * Values array (Int8): * Length: 10, Null count: 0 * Validity bitmap buffer: Not required - | Bytes 0-9 | Bytes 10-63 | - |-------------------------------|-------------| - | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified | + | Bytes 0-9 | Bytes 10-63 | + |-------------------------------|-----------------------| + | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified (padding) | Fixed-Size List Layout ---------------------- @@ -511,9 +511,9 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: * Length: 4, Null count: 1 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00001011 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001011 | 0 (padding) | * Children arrays: * field-0 array (`VarBinary`): @@ -528,13 +528,13 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: | Bytes 0-19 | Bytes 20-63 | |----------------|-----------------------| - | 0, 3, 3, 3, 7 | unspecified | + | 0, 3, 3, 3, 7 | unspecified (padding) | * Value buffer: | Bytes 0-6 | Bytes 7-63 | |----------------|-----------------------| - | joemark | unspecified | + | joemark | unspecified (padding) | * field-1 array (int32 array): * Length: 4, Null count: 1 @@ -546,9 +546,9 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` 
would be: :: * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |------------|-------------|-------------|-------------|-------------| - | 1 | 2 | unspecified | 4 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |-------------|-------------|-------------|-------------|-----------------------| + | 1 | 2 | unspecified | 4 | unspecified (padding) | Struct Validity ~~~~~~~~~~~~~~~ @@ -610,15 +610,15 @@ will have the following layout: :: * Length: 4, Null count: 0 * Types buffer: - |Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | - |---------|-------------|----------|----------|-------------| - | 0 | 0 | 0 | 1 | unspecified | + | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | + |----------|-------------|----------|----------|-----------------------| + | 0 | 0 | 0 | 1 | unspecified (padding) | * Offset buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |----------|-------------|------------|-------------|-------------| - | 0 | 1 | 2 | 0 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |-----------|-------------|------------|-------------|-----------------------| + | 0 | 1 | 2 | 0 | unspecified (padding) | * Children arrays: * Field-0 array (f: Float32): @@ -627,9 +627,9 @@ will have the following layout: :: * Value Buffer: - | Bytes 0-11 | Bytes 12-63 | - |----------------|-------------| - | 1.2, null, 3.4 | unspecified | + | Bytes 0-11 | Bytes 12-63 | + |----------------|-----------------------| + | 1.2, null, 3.4 | unspecified (padding) | * Field-1 array (i: Int32): @@ -638,9 +638,9 @@ will have the following layout: :: * Value Buffer: - | Bytes 0-3 | Bytes 4-63 | - |-----------|-------------| - | 5 | unspecified | + | Bytes 0-3 | Bytes 4-63 | + |-----------|-----------------------| + | 5 | unspecified (padding) | Sparse Union ~~~~~~~~~~~~ @@ -677,29 +677,29 @@ will have the following layout: :: * Length: 6, Null count: 4 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - |00010001 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00010001 | 0 (padding) | * Value buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | - |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | * f (Float32): * Length: 6, Null count: 4 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00001010 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001010 | 0 (padding) | * Value buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | - |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | + | Bytes 0-3 | Bytes 
4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |--------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | * s (`VarBinary`) * Length: 6, Null count: 4 @@ -711,9 +711,9 @@ will have the following layout: :: * Offsets buffer (Int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | - |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | + |------------|-------------|-------------|-------------|-------------|-------------|-------------|------------------------| + | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified (padding) | * Values buffer: diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 40656f6b76f43..3fc10dc7718d3 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -153,8 +153,9 @@ PyArrow natively implements an S3 filesystem for S3-compatible storage. The :class:`S3FileSystem` constructor has several options to configure the S3 connection (e.g. credentials, the region, an endpoint override, etc). In addition, the constructor will also inspect configured S3 credentials as -supported by AWS (for example the ``AWS_ACCESS_KEY_ID`` and -``AWS_SECRET_ACCESS_KEY`` environment variables). +supported by AWS (such as the ``AWS_ACCESS_KEY_ID`` and +``AWS_SECRET_ACCESS_KEY`` environment variables, AWS configuration files, +and EC2 Instance Metadata Service for EC2 nodes).
Example of how you can read contents from an S3 bucket:: diff --git a/docs/source/status.rst b/docs/source/status.rst index 6c55b4bd3e01a..5c8895b114ae3 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -96,7 +96,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Extension | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Run-End Encoded | | | ✓ | | | | | | +| Run-End Encoded | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: diff --git a/format/Flight.proto b/format/Flight.proto index 107e95765406e..b5d23f6f7e6d2 100644 --- a/format/Flight.proto +++ b/format/Flight.proto @@ -20,7 +20,7 @@ syntax = "proto3"; import "google/protobuf/timestamp.proto"; option java_package = "org.apache.arrow.flight.impl"; -option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; +option go_package = "github.com/apache/arrow/go/arrow/flight/gen/flight"; option csharp_namespace = "Apache.Arrow.Flight.Protocol"; package arrow.flight.protocol; diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 48c2d94a11f42..3c9a719f1275f 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -20,7 +20,7 @@ syntax = "proto3"; import "google/protobuf/descriptor.proto"; option java_package = "org.apache.arrow.flight.sql.impl"; -option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; +option go_package = "github.com/apache/arrow/go/arrow/flight/gen/flight"; package arrow.flight.protocol.sql; /* diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index 838fe91d0accc..5d3d2e005d488 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -41,17 +41,6 @@ func Concatenate(arrs []arrow.Array, mem memory.Allocator) (result arrow.Array, return nil, errors.New("array/concat: must pass at least one array") } - defer func() { - if pErr := recover(); pErr != nil { - switch e := pErr.(type) { - case error: - err = fmt.Errorf("arrow/concat: %w", e) - default: - err = fmt.Errorf("arrow/concat: %v", pErr) - } - } - }() - // gather Data of inputs data := make([]arrow.ArrayData, len(arrs)) for i, ar := range arrs { @@ -368,8 +357,21 @@ func concatOffsets(buffers []*memory.Buffer, byteWidth int, mem memory.Allocator // concat is the implementation for actually performing the concatenation of the arrow.ArrayData // objects that we can call internally for nested types.
-func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, error) { +func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, err error) { out := &Data{refCount: 1, dtype: data[0].DataType(), nulls: 0} + defer func() { + if pErr := recover(); pErr != nil { + switch e := pErr.(type) { + case error: + err = fmt.Errorf("arrow/concat: %w", e) + default: + err = fmt.Errorf("arrow/concat: %v", pErr) + } + } + if err != nil { + out.Release() + } + }() for _, d := range data { out.length += d.Len() if out.nulls == UnknownNullCount || d.NullN() == UnknownNullCount { @@ -445,8 +447,8 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, erro if err != nil { return nil, err } - out.buffers[2] = concatBuffers(gatherBufferRanges(data, 2, valueRanges), mem) out.buffers[1] = offsetBuffer + out.buffers[2] = concatBuffers(gatherBufferRanges(data, 2, valueRanges), mem) case *arrow.ListType: offsetWidth := dt.Layout().Buffers[1].ByteWidth offsetBuffer, valueRanges, err := concatOffsets(gatherFixedBuffers(data, 1, offsetWidth), offsetWidth, mem) diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index 6cf86883d1520..a74166541e856 100644 --- a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -743,3 +743,23 @@ func TestConcatOverflowRunEndEncoding(t *testing.T) { }) } } + +func TestConcatPanic(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + allocator := &panicAllocator{ + n: 400, + Allocator: mem, + } + + g := gen.NewRandomArrayGenerator(0, memory.DefaultAllocator) + ar1 := g.ArrayOf(arrow.STRING, 32, 0) + defer ar1.Release() + ar2 := g.ArrayOf(arrow.STRING, 32, 0) + defer ar2.Release() + + concat, err := array.Concatenate([]arrow.Array{ar1, ar2}, allocator) + assert.Error(t, err) + assert.Nil(t, concat) +} diff --git a/go/arrow/array/dictionary.go b/go/arrow/array/dictionary.go index 8c3ffb5247fe0..ccb6f32321496 100644 --- a/go/arrow/array/dictionary.go +++ b/go/arrow/array/dictionary.go @@ -814,11 +814,11 @@ func (b *dictionaryBuilder) newWithDictOffset(offset int) (indices, dict *Data, defer idxarr.Release() indices = idxarr.Data().(*Data) - indices.Retain() b.deltaOffset = b.memoTable.Size() dict, err = GetDictArrayData(b.mem, b.dt.ValueType, b.memoTable, offset) b.reset() + indices.Retain() return } @@ -842,6 +842,11 @@ func (b *dictionaryBuilder) insertDictValue(val interface{}) error { return err } +func (b *dictionaryBuilder) insertDictBytes(val []byte) error { + _, _, err := b.memoTable.GetOrInsertBytes(val) + return err +} + func (b *dictionaryBuilder) appendValue(val interface{}) error { idx, _, err := b.memoTable.GetOrInsert(val) b.idxBuilder.Append(idx) @@ -849,6 +854,13 @@ func (b *dictionaryBuilder) appendValue(val interface{}) error { return err } +func (b *dictionaryBuilder) appendBytes(val []byte) error { + idx, _, err := b.memoTable.GetOrInsertBytes(val) + b.idxBuilder.Append(idx) + b.length += 1 + return err +} + func getvalFn(arr arrow.Array) func(i int) interface{} { switch typedarr := arr.(type) { case *Int8: @@ -1285,16 +1297,18 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error { b.AppendNull() return nil } - return b.appendValue(v) + + return b.appendBytes(v) } -func (b *BinaryDictionaryBuilder) AppendString(v string) error { return b.appendValue(v) } + +func (b *BinaryDictionaryBuilder) AppendString(v string) error { return b.appendBytes([]byte(v)) } func (b *BinaryDictionaryBuilder) InsertDictValues(arr 
*Binary) (err error) { if !arrow.TypeEqual(arr.DataType(), b.dt.ValueType) { return fmt.Errorf("dictionary insert type mismatch: cannot insert values of type %T to dictionary type %T", arr.DataType(), b.dt.ValueType) } for i := 0; i < arr.Len(); i++ { - if err = b.insertDictValue(arr.Value(i)); err != nil { + if err = b.insertDictBytes(arr.Value(i)); err != nil { break } } diff --git a/go/arrow/array/dictionary_test.go b/go/arrow/array/dictionary_test.go index cc252e26855db..99c8e6ffcd47b 100644 --- a/go/arrow/array/dictionary_test.go +++ b/go/arrow/array/dictionary_test.go @@ -19,6 +19,7 @@ package array_test import ( "fmt" "math" + "math/rand" "reflect" "strings" "testing" @@ -1800,3 +1801,67 @@ func TestDictionaryAppendIndices(t *testing.T) { }) } } + +type panicAllocator struct { + n int + paniced bool + memory.Allocator +} + +func (p *panicAllocator) Allocate(size int) []byte { + if size > p.n { + p.paniced = true + panic("panic allocator") + } + return p.Allocator.Allocate(size) +} + +func (p *panicAllocator) Reallocate(size int, b []byte) []byte { + return p.Allocator.Reallocate(size, b) +} + +func (p *panicAllocator) Free(b []byte) { + p.Allocator.Free(b) +} + +func TestBinaryDictionaryPanic(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + allocator := &panicAllocator{ + n: 400, + Allocator: mem, + } + + expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{}, ValueType: arrow.BinaryTypes.String} + bldr := array.NewDictionaryBuilder(allocator, expectedType) + defer bldr.Release() + + bldr.AppendNull() + allocator.n = 0 // force panic + func() { + defer func() { + recover() + }() + bldr.NewArray() + }() + assert.True(t, allocator.paniced) +} + +func BenchmarkBinaryDictionaryBuilder(b *testing.B) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(b, 0) + + dictType := &arrow.DictionaryType{IndexType: &arrow.Int32Type{}, ValueType: arrow.BinaryTypes.String} + bldr := array.NewDictionaryBuilder(mem, dictType) + defer bldr.Release() + + randString := func() string { + return fmt.Sprintf("test-%d", rand.Intn(30)) + } + + builder := bldr.(*array.BinaryDictionaryBuilder) + for i := 0; i < b.N; i++ { + assert.NoError(b, builder.AppendString(randString())) + } +} diff --git a/go/arrow/array/timestamp.go b/go/arrow/array/timestamp.go index ee38a0eb4f041..a9928320748d1 100644 --- a/go/arrow/array/timestamp.go +++ b/go/arrow/array/timestamp.go @@ -90,7 +90,10 @@ func (a *Timestamp) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - return a.values[i].ToTime(a.DataType().(*arrow.TimestampType).Unit).Format("2006-01-02 15:04:05.999999999") + + dt := a.DataType().(*arrow.TimestampType) + z, _ := dt.GetZone() + return a.values[i].ToTime(dt.Unit).In(z).Format("2006-01-02 15:04:05.999999999Z0700") } func (a *Timestamp) GetOneForMarshal(i int) interface{} { @@ -289,7 +292,13 @@ func (b *TimestampBuilder) AppendValueFromString(s string) error { b.AppendNull() return nil } - v, err := arrow.TimestampFromString(s, b.dtype.Unit) + + loc, err := b.dtype.GetZone() + if err != nil { + return err + } + + v, _, err := arrow.TimestampFromStringInLocation(s, b.dtype.Unit, loc) if err != nil { b.AppendNull() return err diff --git a/go/arrow/array/timestamp_test.go b/go/arrow/array/timestamp_test.go index 27978976dbc9c..d8d9f8a389274 100644 --- a/go/arrow/array/timestamp_test.go +++ b/go/arrow/array/timestamp_test.go @@ -233,3 +233,21 @@ func TestTimestampBuilder_Resize(t *testing.T) { 
ab.Resize(32) assert.Equal(t, 5, ab.Len()) } + +func TestTimestampValueStr(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + dt := &arrow.TimestampType{Unit: arrow.Second, TimeZone: "America/Phoenix"} + b := array.NewTimestampBuilder(mem, dt) + defer b.Release() + + b.Append(-34226955) + b.Append(1456767743) + + arr := b.NewArray() + defer arr.Release() + + assert.Equal(t, "1968-11-30 13:30:45-0700", arr.ValueStr(0)) + assert.Equal(t, "2016-02-29 10:42:23-0700", arr.ValueStr(1)) +} diff --git a/go/arrow/cdata/cdata_exports.go b/go/arrow/cdata/cdata_exports.go index 7b2f10ea66723..dae9f5fefe242 100644 --- a/go/arrow/cdata/cdata_exports.go +++ b/go/arrow/cdata/cdata_exports.go @@ -283,7 +283,7 @@ func (exp *schemaExporter) export(field arrow.Field) { func allocateArrowSchemaArr(n int) (out []CArrowSchema) { s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.malloc(C.sizeof_struct_ArrowSchema * C.size_t(n))) + s.Data = uintptr(C.calloc(C.size_t(n), C.sizeof_struct_ArrowSchema)) s.Len = n s.Cap = n @@ -292,7 +292,7 @@ func allocateArrowSchemaArr(n int) (out []CArrowSchema) { func allocateArrowSchemaPtrArr(n int) (out []*CArrowSchema) { s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.malloc(C.size_t(unsafe.Sizeof((*CArrowSchema)(nil))) * C.size_t(n))) + s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*CArrowSchema)(nil))))) s.Len = n s.Cap = n @@ -301,7 +301,7 @@ func allocateArrowSchemaPtrArr(n int) (out []*CArrowSchema) { func allocateArrowArrayArr(n int) (out []CArrowArray) { s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.malloc(C.sizeof_struct_ArrowArray * C.size_t(n))) + s.Data = uintptr(C.calloc(C.size_t(n), C.sizeof_struct_ArrowArray)) s.Len = n s.Cap = n @@ -310,7 +310,7 @@ func allocateArrowArrayArr(n int) (out []CArrowArray) { func allocateArrowArrayPtrArr(n int) (out []*CArrowArray) { s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.malloc(C.size_t(unsafe.Sizeof((*CArrowArray)(nil))) * C.size_t(n))) + s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*CArrowArray)(nil))))) s.Len = n s.Cap = n @@ -319,7 +319,7 @@ func allocateArrowArrayPtrArr(n int) (out []*CArrowArray) { func allocateBufferPtrArr(n int) (out []*C.void) { s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.malloc(C.size_t(unsafe.Sizeof((*C.void)(nil))) * C.size_t(n))) + s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*C.void)(nil))))) s.Len = n s.Cap = n diff --git a/go/arrow/cdata/cdata_fulltest.c b/go/arrow/cdata/cdata_fulltest.c index b85e1e8310f94..7aed597942b51 100644 --- a/go/arrow/cdata/cdata_fulltest.c +++ b/go/arrow/cdata/cdata_fulltest.c @@ -404,6 +404,7 @@ void setup_array_stream_test(const int n_batches, struct ArrowArrayStream* out) int test_exported_stream(struct ArrowArrayStream* stream) { while (1) { struct ArrowArray array; + memset(&array, 0, sizeof(array)); // Garbage - implementation should not try to call it, though! array.release = (void*)0xDEADBEEF; int rc = stream->get_next(stream, &array); @@ -447,3 +448,35 @@ void test_stream_schema_fallible(struct ArrowArrayStream* stream) { stream->private_data = &kFallibleStream; stream->release = FallibleRelease; } + +int confuse_go_gc(struct ArrowArrayStream* stream, unsigned int seed) { + struct ArrowSchema schema; + // Try to confuse the Go GC by putting what looks like a Go pointer here. 
+#ifdef _WIN32 + // Thread-safe on Windows with the multithread CRT +#define DORAND rand() +#else +#define DORAND rand_r(&seed) +#endif + schema.name = (char*)(0xc000000000L + (DORAND % 0x2000)); + schema.format = (char*)(0xc000000000L + (DORAND % 0x2000)); + int rc = stream->get_schema(stream, &schema); + if (rc != 0) return rc; + schema.release(&schema); + + while (1) { + struct ArrowArray array; + array.release = (void*)(0xc000000000L + (DORAND % 0x2000)); + array.private_data = (void*)(0xc000000000L + (DORAND % 0x2000)); + int rc = stream->get_next(stream, &array); + if (rc != 0) return rc; + + if (array.release == NULL) { + stream->release(stream); + break; + } + array.release(&array); + } + return 0; +#undef DORAND +} diff --git a/go/arrow/cdata/cdata_test.go b/go/arrow/cdata/cdata_test.go index f336dec3707da..0c4bbae3d5526 100644 --- a/go/arrow/cdata/cdata_test.go +++ b/go/arrow/cdata/cdata_test.go @@ -29,6 +29,7 @@ import ( "io" "runtime" "runtime/cgo" + "sync" "testing" "time" "unsafe" @@ -768,6 +769,34 @@ func TestExportRecordReaderStream(t *testing.T) { assert.EqualValues(t, len(reclist), i) } +func TestExportRecordReaderStreamLifetime(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + schema := arrow.NewSchema([]arrow.Field{ + {Name: "strings", Type: arrow.BinaryTypes.String, Nullable: false}, + }, nil) + + bldr := array.NewBuilder(mem, &arrow.StringType{}) + defer bldr.Release() + + arr := bldr.NewArray() + defer arr.Release() + + rec := array.NewRecord(schema, []arrow.Array{arr}, 0) + defer rec.Release() + + rdr, _ := array.NewRecordReader(schema, []arrow.Record{rec}) + defer rdr.Release() + + out := createTestStreamObj() + ExportRecordReader(rdr, out) + + // C Stream is holding on to memory + assert.NotEqual(t, 0, mem.CurrentAlloc()) + releaseStream(out) +} + func TestEmptyListExport(t *testing.T) { bldr := array.NewBuilder(memory.DefaultAllocator, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)) defer bldr.Release() @@ -940,3 +969,28 @@ func TestRecordReaderImportError(t *testing.T) { } assert.Contains(t, err.Error(), "Expected error message") } + +func TestConfuseGoGc(t *testing.T) { + // Regression test for https://github.com/apache/arrow-adbc/issues/729 + reclist := arrdata.Records["primitives"] + + var wg sync.WaitGroup + concurrency := 32 + wg.Add(concurrency) + + // XXX: this test is a bit expensive + for i := 0; i < concurrency; i++ { + go func() { + for i := 0; i < 256; i++ { + rdr, err := array.NewRecordReader(reclist[0].Schema(), reclist) + assert.NoError(t, err) + runtime.GC() + assert.NoError(t, confuseGoGc(rdr)) + runtime.GC() + } + wg.Done() + }() + } + + wg.Wait() +} diff --git a/go/arrow/cdata/cdata_test_framework.go b/go/arrow/cdata/cdata_test_framework.go index fb6122964168b..c731c730c6bcd 100644 --- a/go/arrow/cdata/cdata_test_framework.go +++ b/go/arrow/cdata/cdata_test_framework.go @@ -21,11 +21,16 @@ package cdata // #include // #include +// #include // #include "arrow/c/abi.h" // #include "arrow/c/helpers.h" // // void setup_array_stream_test(const int n_batches, struct ArrowArrayStream* out); -// struct ArrowArray* get_test_arr() { return (struct ArrowArray*)(malloc(sizeof(struct ArrowArray))); } +// struct ArrowArray* get_test_arr() { +// struct ArrowArray* array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray)); +// memset(array, 0, sizeof(*array)); +// return array; +// } // struct ArrowArrayStream* get_test_stream() { // struct ArrowArrayStream* out = (struct 
ArrowArrayStream*)malloc(sizeof(struct ArrowArrayStream)); // memset(out, 0, sizeof(struct ArrowArrayStream)); @@ -56,11 +61,13 @@ package cdata // struct ArrowSchema** test_union(const char** fmts, const char** names, int64_t* flags, const int n); // int test_exported_stream(struct ArrowArrayStream* stream); // void test_stream_schema_fallible(struct ArrowArrayStream* stream); +// int confuse_go_gc(struct ArrowArrayStream* stream, unsigned int seed); import "C" import ( "errors" "fmt" "io" + "math/rand" "unsafe" "github.com/apache/arrow/go/v13/arrow" @@ -271,15 +278,17 @@ func createCArr(arr arrow.Array) *CArrowArray { carr.null_count = C.int64_t(arr.NullN()) carr.offset = C.int64_t(arr.Data().Offset()) buffers := arr.Data().Buffers() - cbuf := []unsafe.Pointer{} - for _, b := range buffers { + cbufs := allocateBufferPtrArr(len(buffers)) + for i, b := range buffers { if b != nil { - cbuf = append(cbuf, C.CBytes(b.Bytes())) + cbufs[i] = (*C.void)(C.CBytes(b.Bytes())) + } else { + cbufs[i] = nil } } - carr.n_buffers = C.int64_t(len(cbuf)) - if len(cbuf) > 0 { - carr.buffers = &cbuf[0] + carr.n_buffers = C.int64_t(len(cbufs)) + if len(cbufs) > 0 { + carr.buffers = (*unsafe.Pointer)(unsafe.Pointer(&cbufs[0])) } carr.release = (*[0]byte)(C.release_test_arr) @@ -350,3 +359,14 @@ func fallibleSchemaTest() error { } return nil } + +func confuseGoGc(reader array.RecordReader) error { + out := C.get_test_stream() + ExportRecordReader(reader, out) + rc := C.confuse_go_gc(out, C.uint(rand.Int())) + C.free(unsafe.Pointer(out)) + if rc == 0 { + return nil + } + return fmt.Errorf("Exported stream test failed with return code %d", int(rc)) +} diff --git a/go/arrow/cdata/exports.go b/go/arrow/cdata/exports.go index 2bbd45e58af01..118dec2c38b96 100644 --- a/go/arrow/cdata/exports.go +++ b/go/arrow/cdata/exports.go @@ -28,11 +28,14 @@ import ( // #include // #include "arrow/c/helpers.h" // -// typedef const char cchar_t; -// extern int streamGetSchema(struct ArrowArrayStream*, struct ArrowSchema*); -// extern int streamGetNext(struct ArrowArrayStream*, struct ArrowArray*); -// extern const char* streamGetError(struct ArrowArrayStream*); -// extern void streamRelease(struct ArrowArrayStream*); +// typedef const char cchar_t; +// extern int streamGetSchema(struct ArrowArrayStream*, struct ArrowSchema*); +// extern int streamGetNext(struct ArrowArrayStream*, struct ArrowArray*); +// extern const char* streamGetError(struct ArrowArrayStream*); +// extern void streamRelease(struct ArrowArrayStream*); +// // XXX(https://github.com/apache/arrow-adbc/issues/729) +// int streamGetSchemaTrampoline(struct ArrowArrayStream* stream, struct ArrowSchema* out); +// int streamGetNextTrampoline(struct ArrowArrayStream* stream, struct ArrowArray* out); // import "C" @@ -154,10 +157,11 @@ func streamRelease(handle *CArrowArrayStream) { } func exportStream(rdr array.RecordReader, out *CArrowArrayStream) { - out.get_schema = (*[0]byte)(C.streamGetSchema) - out.get_next = (*[0]byte)(C.streamGetNext) + out.get_schema = (*[0]byte)(C.streamGetSchemaTrampoline) + out.get_next = (*[0]byte)(C.streamGetNextTrampoline) out.get_last_error = (*[0]byte)(C.streamGetError) out.release = (*[0]byte)(C.streamRelease) + rdr.Retain() h := cgo.NewHandle(cRecordReader{rdr: rdr, err: nil}) out.private_data = createHandle(h) } diff --git a/go/arrow/cdata/interface.go b/go/arrow/cdata/interface.go index 64b8176ad221a..50404878005b9 100644 --- a/go/arrow/cdata/interface.go +++ b/go/arrow/cdata/interface.go @@ -198,6 +198,11 @@ func 
ImportCRecordReader(stream *CArrowArrayStream, schema *arrow.Schema) (arrio // the populating of the struct. Any memory allocated will be allocated using malloc // which means that it is invisible to the Go Garbage Collector and must be freed manually // using the callback on the CArrowSchema object. +// +// WARNING: the output ArrowSchema MUST BE ZERO INITIALIZED, or the Go garbage collector +// may error at runtime, due to CGO rules ("the current implementation may sometimes +// cause a runtime error if the contents of the C memory appear to be a Go pointer"). +// You have been warned! func ExportArrowSchema(schema *arrow.Schema, out *CArrowSchema) { dummy := arrow.Field{Type: arrow.StructOf(schema.Fields()...), Metadata: schema.Metadata()} exportField(dummy, out) @@ -220,6 +225,11 @@ func ExportArrowSchema(schema *arrow.Schema, out *CArrowSchema) { // The release function on the populated CArrowArray will properly decrease the reference counts, // and release the memory if the record has already been released. But since this must be explicitly // done, make sure it is released so that you do not create a memory leak. +// +// WARNING: the output ArrowArray MUST BE ZERO INITIALIZED, or the Go garbage collector +// may error at runtime, due to CGO rules ("the current implementation may sometimes +// cause a runtime error if the contents of the C memory appear to be a Go pointer"). +// You have been warned! func ExportArrowRecordBatch(rb arrow.Record, out *CArrowArray, outSchema *CArrowSchema) { children := make([]arrow.ArrayData, rb.NumCols()) for i := range rb.Columns() { @@ -243,6 +253,11 @@ func ExportArrowRecordBatch(rb arrow.Record, out *CArrowArray, outSchema *CArrow // being used by the arrow.Array passed in, in order to share with zero-copy across the C // Data Interface. See the documentation for ExportArrowRecordBatch for details on how to ensure // you do not leak memory and prevent unwanted, undefined or strange behaviors. +// +// WARNING: the output ArrowArray MUST BE ZERO INITIALIZED, or the Go garbage collector +// may error at runtime, due to CGO rules ("the current implementation may sometimes +// cause a runtime error if the contents of the C memory appear to be a Go pointer"). +// You have been warned! func ExportArrowArray(arr arrow.Array, out *CArrowArray, outSchema *CArrowSchema) { exportArray(arr, out, outSchema) } @@ -252,6 +267,11 @@ func ExportArrowArray(arr arrow.Array, out *CArrowArray, outSchema *CArrowSchema // CArrowArrayStream takes ownership of the RecordReader until the consumer calls the release // callback, as such it is unnecessary to call Release on the passed-in reader unless it has // previously been retained. +// +// WARNING: the output ArrowArrayStream MUST BE ZERO INITIALIZED, or the Go garbage +// collector may error at runtime, due to CGO rules ("the current implementation may +// sometimes cause a runtime error if the contents of the C memory appear to be a Go +// pointer"). You have been warned! func ExportRecordReader(reader array.RecordReader, out *CArrowArrayStream) { exportStream(reader, out) } diff --git a/go/arrow/cdata/trampoline.c b/go/arrow/cdata/trampoline.c new file mode 100644 index 0000000000000..01db13fab4845 --- /dev/null +++ b/go/arrow/cdata/trampoline.c @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "arrow/c/abi.h" + +int streamGetSchema(struct ArrowArrayStream*, struct ArrowSchema*); +int streamGetNext(struct ArrowArrayStream*, struct ArrowArray*); + +int streamGetSchemaTrampoline(struct ArrowArrayStream* stream, struct ArrowSchema* out) { + // XXX(https://github.com/apache/arrow-adbc/issues/729) + memset(out, 0, sizeof(*out)); + return streamGetSchema(stream, out); +} + +int streamGetNextTrampoline(struct ArrowArrayStream* stream, struct ArrowArray* out) { + // XXX(https://github.com/apache/arrow-adbc/issues/729) + memset(out, 0, sizeof(*out)); + return streamGetNext(stream, out); +} diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 48224d7ad25bd..ccdf964eae5a1 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -225,6 +225,7 @@ func validate(schema *arrow.Schema) { case *arrow.ListType, *arrow.LargeListType, *arrow.FixedSizeListType: case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: case arrow.ExtensionType: + case *arrow.NullType: default: panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft)) } diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go index 886282d49ff80..3eeb44cfe6e01 100644 --- a/go/arrow/csv/transformer.go +++ b/go/arrow/csv/transformer.go @@ -308,6 +308,10 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] res[i] = arr.ValueStr(i) } } + case *arrow.NullType: + for i := 0; i < col.Len(); i++ { + res[i] = w.nullValue + } default: panic(fmt.Errorf("arrow/csv: field has unsupported data type %s", typ.String())) } diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index cfce4dd0a6142..7216eb8cbd439 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -134,18 +134,18 @@ func Example_writer() { var ( fullData = [][]string{ - {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "large_str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "large_list(i64)", "fixed_size_list(i64)", "binary", "large_binary", "fixed_size_binary", "uuid"}, - {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "{1,2,3}", "{1,2,3}", "AAEC", "AAEC", "AAEC", "00000000-0000-0000-0000-000000000001"}, - {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "{4,5,6}", "{4,5,6}", "AwQF", "AwQF", "AwQF", "00000000-0000-0000-0000-000000000002"}, - {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "{7,8,9}", "{7,8,9}", "", "", "AAAA", "00000000-0000-0000-0000-000000000003"}, - {nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, + {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "large_str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "large_list(i64)", "fixed_size_list(i64)", "binary", "large_binary", "fixed_size_binary", "uuid", "null"}, + {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "{1,2,3}", "{1,2,3}", "AAEC", "AAEC", "AAEC", "00000000-0000-0000-0000-000000000001", nullVal}, + {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "{4,5,6}", "{4,5,6}", "AwQF", "AwQF", "AwQF", "00000000-0000-0000-0000-000000000002", nullVal}, + {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "{7,8,9}", "{7,8,9}", "", "", "AAAA", "00000000-0000-0000-0000-000000000003", nullVal}, + {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, } bananaData = [][]string{ - {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "large_str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "large_list(i64)", "fixed_size_list(i64)", "binary", "large_binary", "fixed_size_binary", "uuid"}, - {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "{1,2,3}", "{1,2,3}", "AAEC", "AAEC", "AAEC", "00000000-0000-0000-0000-000000000001"}, - {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "{4,5,6}", "{4,5,6}", "AwQF", "AwQF", "AwQF", "00000000-0000-0000-0000-000000000002"}, - {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "{7,8,9}", "{7,8,9}", "", "", "AAAA", "00000000-0000-0000-0000-000000000003"}, - {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, + {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "str", "large_str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "large_list(i64)", "fixed_size_list(i64)", "binary", "large_binary", "fixed_size_binary", "uuid", "null"}, + {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "0", "str-0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "{1,2,3}", "{1,2,3}", "AAEC", "AAEC", "AAEC", "00000000-0000-0000-0000-000000000001", nullVal}, + {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.099975586", "0.1", "0.1", "str-1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "{4,5,6}", "{4,5,6}", "AwQF", 
"AwQF", "AwQF", "00000000-0000-0000-0000-000000000002", nullVal}, + {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.19995117", "0.2", "0.2", "str-2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "{7,8,9}", "{7,8,9}", "", "", "AAAA", "00000000-0000-0000-0000-000000000003", nullVal}, + {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, } ) @@ -230,6 +230,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "null", Type: arrow.Null}, }, nil, ) @@ -284,6 +285,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo b.Field(23).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) b.Field(24).(*array.FixedSizeBinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) b.Field(25).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) + b.Field(26).(*array.NullBuilder).AppendEmptyValues(3) for _, field := range b.Fields() { field.AppendNull() diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index d6550c1cf896d..4b6ca55291537 100644 --- a/go/arrow/datatype_fixedwidth.go +++ b/go/arrow/datatype_fixedwidth.go @@ -192,10 +192,16 @@ func TimestampFromString(val string, unit TimeUnit) (Timestamp, error) { } func (t Timestamp) ToTime(unit TimeUnit) time.Time { - if unit == Second { + switch unit { + case Second: return time.Unix(int64(t), 0).UTC() + case Millisecond: + return time.UnixMilli(int64(t)).UTC() + case Microsecond: + return time.UnixMicro(int64(t)).UTC() + default: + return time.Unix(0, int64(t)).UTC() } - return time.Unix(0, int64(t)*int64(unit.Multiplier())).UTC() } // TimestampFromTime allows converting time.Time to Timestamp @@ -327,6 +333,8 @@ const ( var TimeUnitValues = []TimeUnit{Second, Millisecond, Microsecond, Nanosecond} +// Multiplier returns a time.Duration value to multiply by in order to +// convert the value into nanoseconds func (u TimeUnit) Multiplier() time.Duration { return [...]time.Duration{time.Second, time.Millisecond, time.Microsecond, time.Nanosecond}[uint(u)&3] } diff --git a/go/arrow/datatype_fixedwidth_test.go b/go/arrow/datatype_fixedwidth_test.go index 50747366a255e..669c7f9ca87ad 100644 --- a/go/arrow/datatype_fixedwidth_test.go +++ b/go/arrow/datatype_fixedwidth_test.go @@ -159,6 +159,13 @@ func TestTimestampType(t *testing.T) { } } +func TestTimestampToTime(t *testing.T) { + ts := arrow.Timestamp(11865225600000) + tm := ts.ToTime(arrow.Millisecond) + + assert.Equal(t, "2345-12-30 00:00:00", tm.Format("2006-01-02 15:04:05.999")) +} + func TestTime32Type(t *testing.T) { for _, tc := range []struct { unit arrow.TimeUnit diff --git a/go/arrow/flight/client.go b/go/arrow/flight/client.go index 31ffc26cfd35a..1de5fc47f5f28 100644 --- a/go/arrow/flight/client.go +++ b/go/arrow/flight/client.go @@ -26,7 +26,7 @@ import ( "strings" "sync/atomic" - "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + 
"github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 76c9f6fb01d32..f4cd6ee7ce56f 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/ipc" "github.com/apache/arrow/go/v13/arrow/memory" "google.golang.org/grpc" diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 2b57596fb188c..1532ef5f32f54 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" "github.com/apache/arrow/go/v13/arrow/flight/flightsql" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/suite" diff --git a/go/arrow/flight/flightsql/server.go b/go/arrow/flight/flightsql/server.go index ee457ad7a8bca..48c0314fa6490 100644 --- a/go/arrow/flight/flightsql/server.go +++ b/go/arrow/flight/flightsql/server.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" "github.com/apache/arrow/go/v13/arrow/flight/flightsql/schema_ref" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/ipc" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/flight/flightsql/server_test.go b/go/arrow/flight/flightsql/server_test.go index 9ced8e0ed6cdf..43a23bb7e9ac6 100644 --- a/go/arrow/flight/flightsql/server_test.go +++ b/go/arrow/flight/flightsql/server_test.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" "github.com/apache/arrow/go/v13/arrow/flight/flightsql" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/stretchr/testify/suite" "google.golang.org/grpc" diff --git a/go/arrow/flight/flightsql/types.go b/go/arrow/flight/flightsql/types.go index 72de81115a77a..34db36b44df2a 100644 --- a/go/arrow/flight/flightsql/types.go +++ b/go/arrow/flight/flightsql/types.go @@ -17,7 +17,7 @@ package flightsql import ( - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" ) diff --git a/go/arrow/flight/gen.go b/go/arrow/flight/gen.go index 4109059af8ed1..cfdd0e036703a 100644 --- a/go/arrow/flight/gen.go +++ b/go/arrow/flight/gen.go @@ -16,5 +16,5 @@ package flight -//go:generate protoc -I../../../format --go_out=./internal/flight --go-grpc_out=./internal/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative 
Flight.proto -//go:generate protoc --experimental_allow_proto3_optional -I../../../format --go_out=./internal/flight --go-grpc_out=./internal/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative FlightSql.proto +//go:generate protoc -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative Flight.proto +//go:generate protoc --experimental_allow_proto3_optional -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative FlightSql.proto diff --git a/go/arrow/flight/internal/flight/Flight.pb.go b/go/arrow/flight/gen/flight/Flight.pb.go similarity index 98% rename from go/arrow/flight/internal/flight/Flight.pb.go rename to go/arrow/flight/gen/flight/Flight.pb.go index 7b4d1e2fd9298..f91d762014603 100644 --- a/go/arrow/flight/internal/flight/Flight.pb.go +++ b/go/arrow/flight/gen/flight/Flight.pb.go @@ -18,15 +18,15 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.28.1 -// protoc v3.21.12 +// protoc v3.12.4 // source: Flight.proto package flight import ( + timestamp "github.com/golang/protobuf/ptypes/timestamp" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" - timestamppb "google.golang.org/protobuf/types/known/timestamppb" reflect "reflect" sync "sync" ) @@ -38,6 +38,7 @@ const ( _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) +// // The result of a cancel operation. // // This is used by CancelFlightInfoResult.status. @@ -102,17 +103,19 @@ func (CancelStatus) EnumDescriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{0} } +// // Describes what type of descriptor is defined. type FlightDescriptor_DescriptorType int32 const ( // Protobuf pattern, not used. FlightDescriptor_UNKNOWN FlightDescriptor_DescriptorType = 0 + // // A named path that identifies a dataset. A path is composed of a string // or list of strings describing a particular dataset. This is conceptually - // - // similar to a path inside a filesystem. + // similar to a path inside a filesystem. FlightDescriptor_PATH FlightDescriptor_DescriptorType = 1 + // // An opaque command to generate a dataset. FlightDescriptor_CMD FlightDescriptor_DescriptorType = 2 ) @@ -158,14 +161,17 @@ func (FlightDescriptor_DescriptorType) EnumDescriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{12, 0} } +// // The request that a client provides to a server on handshake. type HandshakeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // A defined protocol version ProtocolVersion uint64 `protobuf:"varint,1,opt,name=protocol_version,json=protocolVersion,proto3" json:"protocol_version,omitempty"` + // // Arbitrary auth/handshake info. Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` } @@ -221,8 +227,10 @@ type HandshakeResponse struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // A defined protocol version ProtocolVersion uint64 `protobuf:"varint,1,opt,name=protocol_version,json=protocolVersion,proto3" json:"protocol_version,omitempty"` + // // Arbitrary auth/handshake info. Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` } @@ -273,6 +281,7 @@ func (x *HandshakeResponse) GetPayload() []byte { return nil } +// // A message for doing simple auth. 
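An aside on the arrow.Timestamp.ToTime change earlier in this diff: dispatching Second/Millisecond/Microsecond values to time.Unix, time.UnixMilli, and time.UnixMicro, instead of always multiplying into nanoseconds, keeps far-future values from overflowing int64 (nanosecond timestamps max out in the year 2262), which appears to be exactly what the new 2345-12-30 test case exercises. A tiny usage sketch, not from the patch:

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v13/arrow"
)

func main() {
	// 11865225600000 ms is 2345-12-30; multiplied into nanoseconds it
	// would overflow int64, so ToTime now routes it via time.UnixMilli.
	ts := arrow.Timestamp(11865225600000)
	fmt.Println(ts.ToTime(arrow.Millisecond)) // 2345-12-30 00:00:00 +0000 UTC
}
```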
type BasicAuth struct { state protoimpl.MessageState @@ -367,6 +376,7 @@ func (*Empty) Descriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{3} } +// // Describes an available action, including both the name used for execution // along with a short description of the purpose of the action. type ActionType struct { @@ -424,6 +434,7 @@ func (x *ActionType) GetDescription() string { return "" } +// // A service specific expression that can be used to return a limited set // of available Arrow Flight streams. type Criteria struct { @@ -473,6 +484,7 @@ func (x *Criteria) GetExpression() []byte { return nil } +// // An opaque action specific for the service. type Action struct { state protoimpl.MessageState @@ -529,6 +541,7 @@ func (x *Action) GetBody() []byte { return nil } +// // The request of the CancelFlightInfo action. // // The request should be stored in Action.body. @@ -579,6 +592,7 @@ func (x *CancelFlightInfoRequest) GetInfo() *FlightInfo { return nil } +// // The request of the RenewFlightEndpoint action. // // The request should be stored in Action.body. @@ -629,6 +643,7 @@ func (x *RenewFlightEndpointRequest) GetEndpoint() *FlightEndpoint { return nil } +// // An opaque result returned after executing an action. type Result struct { state protoimpl.MessageState @@ -677,6 +692,7 @@ func (x *Result) GetBody() []byte { return nil } +// // The result of the CancelFlightInfo action. // // The result should be stored in Result.body. @@ -727,6 +743,7 @@ func (x *CancelFlightInfoResult) GetStatus() CancelStatus { return CancelStatus_CANCEL_STATUS_UNSPECIFIED } +// // Wrap the result of a getSchema call type SchemaResult struct { state protoimpl.MessageState @@ -734,10 +751,9 @@ type SchemaResult struct { unknownFields protoimpl.UnknownFields // The schema of the dataset in its IPC form: - // - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema Schema []byte `protobuf:"bytes,1,opt,name=schema,proto3" json:"schema,omitempty"` } @@ -780,6 +796,7 @@ func (x *SchemaResult) GetSchema() []byte { return nil } +// // The name or tag for a Flight. May be used as a way to retrieve or generate // a flight or be used to expose a set of previously defined flights. type FlightDescriptor struct { @@ -788,9 +805,11 @@ type FlightDescriptor struct { unknownFields protoimpl.UnknownFields Type FlightDescriptor_DescriptorType `protobuf:"varint,1,opt,name=type,proto3,enum=arrow.flight.protocol.FlightDescriptor_DescriptorType" json:"type,omitempty"` + // // Opaque value used to express a command. Should only be defined when // type = CMD. Cmd []byte `protobuf:"bytes,2,opt,name=cmd,proto3" json:"cmd,omitempty"` + // // List of strings identifying a particular dataset. Should only be defined // when type = PATH. Path []string `protobuf:"bytes,3,rep,name=path,proto3" json:"path,omitempty"` @@ -849,6 +868,7 @@ func (x *FlightDescriptor) GetPath() []string { return nil } +// // The access coordinates for retrieval of a dataset. With a FlightInfo, a // consumer is able to determine how to retrieve a dataset. 
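Before the FlightInfo message itself, a consumer-side sketch of the contract its comments describe: the flight is only fully consumed once every endpoint's Ticket has been redeemed via DoGet. This is illustrative rather than part of the patch; it assumes an already-connected flight.Client and keeps error handling minimal.

```go
package flightexample

import (
	"context"

	"github.com/apache/arrow/go/v13/arrow/flight"
)

// consumeAll redeems every ticket in a FlightInfo. Endpoints may be read
// in any order unless the server set Ordered on the FlightInfo.
func consumeAll(ctx context.Context, c flight.Client, info *flight.FlightInfo) error {
	for _, ep := range info.Endpoint {
		stream, err := c.DoGet(ctx, ep.Ticket)
		if err != nil {
			return err
		}
		rdr, err := flight.NewRecordReader(stream)
		if err != nil {
			return err
		}
		for rdr.Next() {
			rec := rdr.Record() // process each record batch here
			_ = rec
		}
		rdr.Release()
	}
	return nil
}
```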
type FlightInfo struct { @@ -857,13 +877,14 @@ type FlightInfo struct { unknownFields protoimpl.UnknownFields // The schema of the dataset in its IPC form: - // - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema Schema []byte `protobuf:"bytes,1,opt,name=schema,proto3" json:"schema,omitempty"` + // // The descriptor associated with this info. FlightDescriptor *FlightDescriptor `protobuf:"bytes,2,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // A list of endpoints associated with the flight. To consume the // whole flight, all endpoints (and hence all Tickets) must be // consumed. Endpoints can be consumed in any order. @@ -883,13 +904,14 @@ type FlightInfo struct { // ordering is important for an application, an application must // choose one of them: // - // - An application requires that all clients must read data in - // returned endpoints order. - // - An application must return the all data in a single endpoint. + // * An application requires that all clients must read data in + // returned endpoints order. + // * An application must return the all data in a single endpoint. Endpoint []*FlightEndpoint `protobuf:"bytes,3,rep,name=endpoint,proto3" json:"endpoint,omitempty"` // Set these to -1 if unknown. TotalRecords int64 `protobuf:"varint,4,opt,name=total_records,json=totalRecords,proto3" json:"total_records,omitempty"` TotalBytes int64 `protobuf:"varint,5,opt,name=total_bytes,json=totalBytes,proto3" json:"total_bytes,omitempty"` + // // FlightEndpoints are in the same order as the data. Ordered bool `protobuf:"varint,6,opt,name=ordered,proto3" json:"ordered,omitempty"` } @@ -968,14 +990,17 @@ func (x *FlightInfo) GetOrdered() bool { return false } +// // A particular stream or split associated with a flight. type FlightEndpoint struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Token used to retrieve this stream. Ticket *Ticket `protobuf:"bytes,1,opt,name=ticket,proto3" json:"ticket,omitempty"` + // // A list of URIs where this ticket can be redeemed via DoGet(). // // If the list is empty, the expectation is that the ticket can only @@ -991,10 +1016,11 @@ type FlightEndpoint struct { // In other words, an application can use multiple locations to // represent redundant and/or load balanced services. Location []*Location `protobuf:"bytes,2,rep,name=location,proto3" json:"location,omitempty"` + // // Expiration time of this stream. If present, clients may assume // they can retry DoGet requests. Otherwise, it is // application-defined whether DoGet requests may be retried. 
- ExpirationTime *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=expiration_time,json=expirationTime,proto3" json:"expiration_time,omitempty"` + ExpirationTime *timestamp.Timestamp `protobuf:"bytes,3,opt,name=expiration_time,json=expirationTime,proto3" json:"expiration_time,omitempty"` } func (x *FlightEndpoint) Reset() { @@ -1043,13 +1069,14 @@ func (x *FlightEndpoint) GetLocation() []*Location { return nil } -func (x *FlightEndpoint) GetExpirationTime() *timestamppb.Timestamp { +func (x *FlightEndpoint) GetExpirationTime() *timestamp.Timestamp { if x != nil { return x.ExpirationTime } return nil } +// // A location where a Flight service will accept retrieval of a particular // stream given a ticket. type Location struct { @@ -1099,6 +1126,7 @@ func (x *Location) GetUri() string { return "" } +// // An opaque identifier that the service can use to retrieve a particular // portion of a stream. // @@ -1151,19 +1179,24 @@ func (x *Ticket) GetTicket() []byte { return nil } +// // A batch of Arrow data as part of a stream of batches. type FlightData struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // The descriptor of the data. This is only relevant when a client is // starting a new DoPut stream. FlightDescriptor *FlightDescriptor `protobuf:"bytes,1,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // Header for message data as described in Message.fbs::Message. DataHeader []byte `protobuf:"bytes,2,opt,name=data_header,json=dataHeader,proto3" json:"data_header,omitempty"` + // // Application-defined metadata. AppMetadata []byte `protobuf:"bytes,3,opt,name=app_metadata,json=appMetadata,proto3" json:"app_metadata,omitempty"` + // // The actual batch of Arrow data. Preferably handled with minimal-copies // coming last in the definition to help with sidecar patterns (it is // expected that some implementations will fetch this field off the wire @@ -1231,7 +1264,7 @@ func (x *FlightData) GetDataBody() []byte { return nil } -// * +//* // The response message associated with the submission of a DoPut. type PutResult struct { state protoimpl.MessageState @@ -1502,7 +1535,7 @@ var file_Flight_proto_goTypes = []interface{}{ (*Ticket)(nil), // 18: arrow.flight.protocol.Ticket (*FlightData)(nil), // 19: arrow.flight.protocol.FlightData (*PutResult)(nil), // 20: arrow.flight.protocol.PutResult - (*timestamppb.Timestamp)(nil), // 21: google.protobuf.Timestamp + (*timestamp.Timestamp)(nil), // 21: google.protobuf.Timestamp } var file_Flight_proto_depIdxs = []int32{ 15, // 0: arrow.flight.protocol.CancelFlightInfoRequest.info:type_name -> arrow.flight.protocol.FlightInfo diff --git a/go/arrow/flight/internal/flight/FlightSql.pb.go b/go/arrow/flight/gen/flight/FlightSql.pb.go similarity index 94% rename from go/arrow/flight/internal/flight/FlightSql.pb.go rename to go/arrow/flight/gen/flight/FlightSql.pb.go index b61ac29066836..3b1ba232d3d12 100644 --- a/go/arrow/flight/internal/flight/FlightSql.pb.go +++ b/go/arrow/flight/gen/flight/FlightSql.pb.go @@ -18,15 +18,15 @@ // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.28.1 -// protoc v3.21.12 +// protoc v3.12.4 // source: FlightSql.proto package flight import ( + descriptor "github.com/golang/protobuf/protoc-gen-go/descriptor" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" - descriptorpb "google.golang.org/protobuf/types/descriptorpb" reflect "reflect" sync "sync" ) @@ -48,27 +48,33 @@ const ( SqlInfo_FLIGHT_SQL_SERVER_VERSION SqlInfo = 1 // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. SqlInfo_FLIGHT_SQL_SERVER_ARROW_VERSION SqlInfo = 2 + // // Retrieves a boolean value indicating whether the Flight SQL Server is read only. // // Returns: // - false: if read-write // - true: if read only SqlInfo_FLIGHT_SQL_SERVER_READ_ONLY SqlInfo = 3 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports executing // SQL queries. // // Note that the absence of this info (as opposed to a false value) does not necessarily // mean that SQL is not supported, as this property was not originally defined. SqlInfo_FLIGHT_SQL_SERVER_SQL SqlInfo = 4 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports executing // Substrait plans. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT SqlInfo = 5 + // // Retrieves a string value indicating the minimum supported Substrait version, or null // if Substrait is not supported. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION SqlInfo = 6 + // // Retrieves a string value indicating the maximum supported Substrait version, or null // if Substrait is not supported. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION SqlInfo = 7 + // // Retrieves an int32 indicating whether the Flight SQL Server supports the // BeginTransaction/EndTransaction/BeginSavepoint/EndSavepoint actions. // @@ -78,51 +84,61 @@ const ( // // The possible values are listed in `SqlSupportedTransaction`. SqlInfo_FLIGHT_SQL_SERVER_TRANSACTION SqlInfo = 8 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports explicit // query cancellation (the CancelQuery action). SqlInfo_FLIGHT_SQL_SERVER_CANCEL SqlInfo = 9 + // // Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. // // If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. SqlInfo_FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT SqlInfo = 100 + // // Retrieves an int32 indicating the timeout (in milliseconds) for transactions, since transactions are not tied to a connection. // // If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. SqlInfo_FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT SqlInfo = 101 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. // // Returns: // - false: if it doesn't support CREATE and DROP of catalogs. // - true: if it supports CREATE and DROP of catalogs. SqlInfo_SQL_DDL_CATALOG SqlInfo = 500 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. // // Returns: // - false: if it doesn't support CREATE and DROP of schemas. // - true: if it supports CREATE and DROP of schemas. SqlInfo_SQL_DDL_SCHEMA SqlInfo = 501 + // // Indicates whether the Flight SQL Server supports CREATE and DROP of tables. // // Returns: // - false: if it doesn't support CREATE and DROP of tables. // - true: if it supports CREATE and DROP of tables. 
SqlInfo_SQL_DDL_TABLE SqlInfo = 502 + // // Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. // // The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlInfo_SQL_IDENTIFIER_CASE SqlInfo = 503 // Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. SqlInfo_SQL_IDENTIFIER_QUOTE_CHAR SqlInfo = 504 + // // Retrieves a int32 describing the case sensitivity of quoted identifiers. // // The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlInfo_SQL_QUOTED_IDENTIFIER_CASE SqlInfo = 505 + // // Retrieves a boolean value indicating whether all tables are selectable. // // Returns: // - false: if not all tables are selectable or if none are; // - true: if all tables are selectable. SqlInfo_SQL_ALL_TABLES_ARE_SELECTABLE SqlInfo = 506 + // // Retrieves the null ordering. // // Returns a int32 ordinal for the null ordering being used, as described in @@ -138,15 +154,18 @@ const ( SqlInfo_SQL_SYSTEM_FUNCTIONS SqlInfo = 511 // Retrieves a UTF-8 string list with values of the supported datetime functions. SqlInfo_SQL_DATETIME_FUNCTIONS SqlInfo = 512 + // // Retrieves the UTF-8 string that can be used to escape wildcard characters. // This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern // (and therefore use one of the wildcard characters). // The '_' character represents any single character; the '%' character represents any sequence of zero or more // characters. SqlInfo_SQL_SEARCH_STRING_ESCAPE SqlInfo = 513 + // // Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names // (those beyond a-z, A-Z, 0-9 and _). SqlInfo_SQL_EXTRA_NAME_CHARACTERS SqlInfo = 514 + // // Retrieves a boolean value indicating whether column aliasing is supported. // If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns // as required. @@ -155,6 +174,7 @@ const ( // - false: if column aliasing is unsupported; // - true: if column aliasing is supported. SqlInfo_SQL_SUPPORTS_COLUMN_ALIASING SqlInfo = 515 + // // Retrieves a boolean value indicating whether concatenations between null and non-null values being // null are supported. // @@ -162,11 +182,13 @@ const ( // - false: if concatenations between null and non-null values being null are unsupported; // - true: if concatenations between null and non-null values being null are supported. SqlInfo_SQL_NULL_PLUS_NULL_IS_NULL SqlInfo = 516 + // // Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, // indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on // SqlSupportsConvert enum. // The returned map will be: map> SqlInfo_SQL_SUPPORTS_CONVERT SqlInfo = 517 + // // Retrieves a boolean value indicating whether, when table correlation names are supported, // they are restricted to being different from the names of the tables. // @@ -174,6 +196,7 @@ const ( // - false: if table correlation names are unsupported; // - true: if table correlation names are supported. SqlInfo_SQL_SUPPORTS_TABLE_CORRELATION_NAMES SqlInfo = 518 + // // Retrieves a boolean value indicating whether, when table correlation names are supported, // they are restricted to being different from the names of the tables. 
// @@ -181,12 +204,14 @@ const ( // - false: if different table correlation names are unsupported; // - true: if different table correlation names are supported SqlInfo_SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES SqlInfo = 519 + // // Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. // // Returns: // - false: if expressions in ORDER BY are unsupported; // - true: if expressions in ORDER BY are supported; SqlInfo_SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY SqlInfo = 520 + // // Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY // clause is supported. // @@ -194,6 +219,7 @@ const ( // - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; // - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. SqlInfo_SQL_SUPPORTS_ORDER_BY_UNRELATED SqlInfo = 521 + // // Retrieves the supported GROUP BY commands; // // Returns an int32 bitmask value representing the supported commands. @@ -206,18 +232,21 @@ const ( // - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. // Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlInfo_SQL_SUPPORTED_GROUP_BY SqlInfo = 522 + // // Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. // // Returns: // - false: if specifying a LIKE escape clause is unsupported; // - true: if specifying a LIKE escape clause is supported. SqlInfo_SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE SqlInfo = 523 + // // Retrieves a boolean value indicating whether columns may be defined as non-nullable. // // Returns: // - false: if columns cannot be defined as non-nullable; // - true: if columns may be defined as non-nullable. SqlInfo_SQL_SUPPORTS_NON_NULLABLE_COLUMNS SqlInfo = 524 + // // Retrieves the supported SQL grammar level as per the ODBC specification. // // Returns an int32 bitmask value representing the supported SQL grammar level. @@ -234,6 +263,7 @@ const ( // - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. // Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlInfo_SQL_SUPPORTED_GRAMMAR SqlInfo = 525 + // // Retrieves the supported ANSI92 SQL grammar level. // // Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. @@ -250,12 +280,14 @@ const ( // - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. // Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlInfo_SQL_ANSI92_SUPPORTED_LEVEL SqlInfo = 526 + // // Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. // // Returns: // - false: if the SQL Integrity Enhancement Facility is supported; // - true: if the SQL Integrity Enhancement Facility is supported. SqlInfo_SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY SqlInfo = 527 + // // Retrieves the support level for SQL OUTER JOINs. // // Returns a int32 ordinal for the SQL ordering being used, as described in @@ -265,14 +297,17 @@ const ( SqlInfo_SQL_SCHEMA_TERM SqlInfo = 529 // Retrieves a UTF-8 string with the preferred term for "procedure". SqlInfo_SQL_PROCEDURE_TERM SqlInfo = 530 + // // Retrieves a UTF-8 string with the preferred term for "catalog". // If a empty string is returned its assumed that the server does NOT supports catalogs. 
SqlInfo_SQL_CATALOG_TERM SqlInfo = 531 + // // Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. // // - false: if a catalog does not appear at the start of a fully qualified table name; // - true: if a catalog appears at the start of a fully qualified table name. SqlInfo_SQL_CATALOG_AT_START SqlInfo = 532 + // // Retrieves the supported actions for a SQL schema. // // Returns an int32 bitmask value representing the supported actions for a SQL schema. @@ -289,6 +324,7 @@ const ( // - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. // Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlInfo_SQL_SCHEMAS_SUPPORTED_ACTIONS SqlInfo = 533 + // // Retrieves the supported actions for a SQL schema. // // Returns an int32 bitmask value representing the supported actions for a SQL catalog. @@ -305,6 +341,7 @@ const ( // - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. // Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlInfo_SQL_CATALOGS_SUPPORTED_ACTIONS SqlInfo = 534 + // // Retrieves the supported SQL positioned commands. // // Returns an int32 bitmask value representing the supported SQL positioned commands. @@ -317,12 +354,14 @@ const ( // - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. // Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. SqlInfo_SQL_SUPPORTED_POSITIONED_COMMANDS SqlInfo = 535 + // // Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. // // Returns: // - false: if SELECT FOR UPDATE statements are unsupported; // - true: if SELECT FOR UPDATE statements are supported. SqlInfo_SQL_SELECT_FOR_UPDATE_SUPPORTED SqlInfo = 536 + // // Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax // are supported. // @@ -330,6 +369,7 @@ const ( // - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; // - true: if stored procedure calls that use the stored procedure escape syntax are supported. SqlInfo_SQL_STORED_PROCEDURES_SUPPORTED SqlInfo = 537 + // // Retrieves the supported SQL subqueries. // // Returns an int32 bitmask value representing the supported SQL subqueries. @@ -355,12 +395,14 @@ const ( // - ... // Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlInfo_SQL_SUPPORTED_SUBQUERIES SqlInfo = 538 + // // Retrieves a boolean value indicating whether correlated subqueries are supported. // // Returns: // - false: if correlated subqueries are unsupported; // - true: if correlated subqueries are supported. SqlInfo_SQL_CORRELATED_SUBQUERIES_SUPPORTED SqlInfo = 539 + // // Retrieves the supported SQL UNIONs. // // Returns an int32 bitmask value representing the supported SQL UNIONs. @@ -393,6 +435,7 @@ const ( SqlInfo_SQL_MAX_CONNECTIONS SqlInfo = 549 // Retrieves a int64 value the maximum number of characters allowed in a cursor name. SqlInfo_SQL_MAX_CURSOR_NAME_LENGTH SqlInfo = 550 + // // Retrieves a int64 value representing the maximum number of bytes allowed for an index, // including all of the parts of the index. 
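Many of the SqlInfo values in this block (SQL_SUPPORTED_GROUP_BY, SQL_SUPPORTED_GRAMMAR, and similar) encode a set of enum ordinals as an int32 bitmask, with bit N set when ordinal N is supported. A hedged decoding sketch, using the enum identifiers as generated in this package:

```go
package main

import (
	"fmt"

	pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight"
)

func main() {
	// Per the comments above, a server answering 3 (0b11) for
	// SQL_SUPPORTED_GROUP_BY supports both group-by variants.
	bitmask := int32(3)
	for _, g := range []pb.SqlSupportedGroupBy{
		pb.SqlSupportedGroupBy_SQL_GROUP_BY_UNRELATED,
		pb.SqlSupportedGroupBy_SQL_GROUP_BY_BEYOND_SELECT,
	} {
		if bitmask&(int32(1)<<uint(g)) != 0 {
			fmt.Println("supported:", g)
		}
	}
}
```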
SqlInfo_SQL_MAX_INDEX_LENGTH SqlInfo = 551 @@ -404,15 +447,17 @@ const ( SqlInfo_SQL_MAX_CATALOG_NAME_LENGTH SqlInfo = 554 // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. SqlInfo_SQL_MAX_ROW_SIZE SqlInfo = 555 + // // Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL // data types LONGVARCHAR and LONGVARBINARY. // // Returns: - // - false: if return value for the JDBC method getMaxRowSize does - // not include the SQL data types LONGVARCHAR and LONGVARBINARY; - // - true: if return value for the JDBC method getMaxRowSize includes - // the SQL data types LONGVARCHAR and LONGVARBINARY. + // - false: if return value for the JDBC method getMaxRowSize does + // not include the SQL data types LONGVARCHAR and LONGVARBINARY; + // - true: if return value for the JDBC method getMaxRowSize includes + // the SQL data types LONGVARCHAR and LONGVARBINARY. SqlInfo_SQL_MAX_ROW_SIZE_INCLUDES_BLOBS SqlInfo = 556 + // // Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; // a result of 0 (zero) means that there is no limit or the limit is not known. SqlInfo_SQL_MAX_STATEMENT_LENGTH SqlInfo = 557 @@ -424,11 +469,13 @@ const ( SqlInfo_SQL_MAX_TABLES_IN_SELECT SqlInfo = 560 // Retrieves a int64 value representing the maximum number of characters allowed in a user name. SqlInfo_SQL_MAX_USERNAME_LENGTH SqlInfo = 561 + // // Retrieves this database's default transaction isolation level as described in // `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. // // Returns a int32 ordinal for the SQL transaction isolation level. SqlInfo_SQL_DEFAULT_TRANSACTION_ISOLATION SqlInfo = 562 + // // Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a // noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. // @@ -436,6 +483,7 @@ const ( // - false: if transactions are unsupported; // - true: if transactions are supported. SqlInfo_SQL_TRANSACTIONS_SUPPORTED SqlInfo = 563 + // // Retrieves the supported transactions isolation levels. // // Returns an int32 bitmask value representing the supported transactions isolation levels. @@ -462,6 +510,7 @@ const ( // - ... // Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. SqlInfo_SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS SqlInfo = 564 + // // Retrieves a boolean value indicating whether a data definition statement within a transaction forces // the transaction to commit. // @@ -469,12 +518,14 @@ const ( // - false: if a data definition statement within a transaction does not force the transaction to commit; // - true: if a data definition statement within a transaction forces the transaction to commit. SqlInfo_SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT SqlInfo = 565 + // // Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. // // Returns: // - false: if a data definition statement within a transaction is taken into account; // - true: a data definition statement within a transaction is ignored. SqlInfo_SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED SqlInfo = 566 + // // Retrieves an int32 bitmask value representing the supported result set types. // The returned bitmask should be parsed in order to retrieve the supported result set types. // @@ -491,6 +542,7 @@ const ( // - ... 
// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. SqlInfo_SQL_SUPPORTED_RESULT_SET_TYPES SqlInfo = 567 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. // @@ -505,6 +557,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED SqlInfo = 568 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. // @@ -519,6 +572,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY SqlInfo = 569 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. // @@ -533,6 +587,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE SqlInfo = 570 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. // @@ -547,29 +602,34 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE SqlInfo = 571 + // // Retrieves a boolean value indicating whether this database supports batch updates. // // - false: if this database does not support batch updates; // - true: if this database supports batch updates. SqlInfo_SQL_BATCH_UPDATES_SUPPORTED SqlInfo = 572 + // // Retrieves a boolean value indicating whether this database supports savepoints. // // Returns: // - false: if this database does not support savepoints; // - true: if this database supports savepoints. SqlInfo_SQL_SAVEPOINTS_SUPPORTED SqlInfo = 573 + // // Retrieves a boolean value indicating whether named parameters are supported in callable statements. // // Returns: // - false: if named parameters in callable statements are unsupported; // - true: if named parameters in callable statements are supported. SqlInfo_SQL_NAMED_PARAMETERS_SUPPORTED SqlInfo = 574 + // // Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. // // Returns: // - false: if updates made to a LOB are made directly to the LOB; // - true: if updates made to a LOB are made on a copy. SqlInfo_SQL_LOCATORS_UPDATE_COPY SqlInfo = 575 + // // Retrieves a boolean value indicating whether invoking user-defined or vendor functions // using the stored procedure escape syntax is supported. 
// @@ -1642,7 +1702,7 @@ func (SqlSupportsConvert) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{16} } -// * +//* // The JDBC/ODBC-defined type of any object. // All the values here are the sames as in the JDBC and ODBC specs. type XdbcDataType int32 @@ -1757,7 +1817,7 @@ func (XdbcDataType) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{17} } -// * +//* // Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. type XdbcDatetimeSubcode int32 @@ -1898,13 +1958,13 @@ func (XdbcDatetimeSubcode) EnumDescriptor() ([]byte, []int) { type Nullable int32 const ( - // * + //* // Indicates that the fields does not allow the use of null values. Nullable_NULLABILITY_NO_NULLS Nullable = 0 - // * + //* // Indicates that the fields allow the use of null values. Nullable_NULLABILITY_NULLABLE Nullable = 1 - // * + //* // Indicates that nullability of the fields can not be determined. Nullable_NULLABILITY_UNKNOWN Nullable = 2 ) @@ -1953,21 +2013,21 @@ func (Nullable) EnumDescriptor() ([]byte, []int) { type Searchable int32 const ( - // * + //* // Indicates that column can not be used in a WHERE clause. Searchable_SEARCHABLE_NONE Searchable = 0 - // * + //* // Indicates that the column can be used in a WHERE clause if it is using a // LIKE operator. Searchable_SEARCHABLE_CHAR Searchable = 1 - // * + //* // Indicates that the column can be used In a WHERE clause with any // operator other than LIKE. // - // - Allowed operators: comparison, quantified comparison, BETWEEN, - // DISTINCT, IN, MATCH, and UNIQUE. + // - Allowed operators: comparison, quantified comparison, BETWEEN, + // DISTINCT, IN, MATCH, and UNIQUE. Searchable_SEARCHABLE_BASIC Searchable = 2 - // * + //* // Indicates that the column can be used in a WHERE clause using any operator. Searchable_SEARCHABLE_FULL Searchable = 3 ) @@ -2233,23 +2293,22 @@ func (ActionCancelQueryResult_CancelResult) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{29, 0} } +// // Represents a metadata request. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the metadata request. // // The returned Arrow schema will be: // < -// -// info_name: uint32 not null, -// value: dense_union< -// string_value: utf8, -// bool_value: bool, -// bigint_value: int64, -// int32_bitmask: int32, -// string_list: list -// int32_to_int32_list_map: map> -// +// info_name: uint32 not null, +// value: dense_union< +// string_value: utf8, +// bool_value: bool, +// bigint_value: int64, +// int32_bitmask: int32, +// string_list: list +// int32_to_int32_list_map: map> // > // where there is one row per requested piece of metadata information. type CommandGetSqlInfo struct { @@ -2257,6 +2316,7 @@ type CommandGetSqlInfo struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide // Flight SQL clients with basic, SQL syntax and SQL functions related information. // More information types can be added in future releases. @@ -2316,62 +2376,61 @@ func (x *CommandGetSqlInfo) GetInfo() []uint32 { return nil } +// // Represents a request to retrieve information about data type supported on a Flight SQL enabled backend. 
// Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned schema will be: // < -// -// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), -// data_type: int32 not null (The SQL data type), -// column_size: int32 (The maximum size supported by that column. -// In case of exact numeric types, this represents the maximum precision. -// In case of string types, this represents the character length. -// In case of datetime data types, this represents the length in characters of the string representation. -// NULL is returned for data types where column size is not applicable.), -// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for -// data types where a literal prefix is not applicable.), -// literal_suffix: utf8 (Character or characters used to terminate a literal, -// NULL is returned for data types where a literal suffix is not applicable.), -// create_params: list -// (A list of keywords corresponding to which parameters can be used when creating -// a column for that specific type. -// NULL is returned if there are no parameters for the data type definition.), -// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the -// Nullable enum.), -// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), -// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the -// Searchable enum.), -// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is -// not applicable to the data type or the data type is not numeric.), -// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), -// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute -// is not applicable to the data type or the data type is not numeric.), -// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL -// is returned if a localized name is not supported by the data source), -// minimum_scale: int32 (The minimum scale of the data type on the data source. -// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE -// columns both contain this value. NULL is returned if scale is not applicable.), -// maximum_scale: int32 (The maximum scale of the data type on the data source. -// NULL is returned if scale is not applicable.), -// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values -// as data_type value. Except for interval and datetime, which -// uses generic values. More info about those types can be -// obtained through datetime_subcode. The possible values can be seen -// in the XdbcDataType enum.), -// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains -// its sub types. For type different from interval and datetime, this value -// is NULL. 
The possible values can be seen in the XdbcDatetimeSubcode enum.), -// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains -// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For -// exact numeric types, this column contains the value 10 to indicate that -// column size specifies a number of decimal digits. Otherwise, this column is NULL.), -// interval_precision: int32 (If the data type is an interval data type, then this column contains the value -// of the interval leading precision. Otherwise, this column is NULL. This fields -// is only relevant to be used by ODBC). -// +// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), +// data_type: int32 not null (The SQL data type), +// column_size: int32 (The maximum size supported by that column. +// In case of exact numeric types, this represents the maximum precision. +// In case of string types, this represents the character length. +// In case of datetime data types, this represents the length in characters of the string representation. +// NULL is returned for data types where column size is not applicable.), +// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for +// data types where a literal prefix is not applicable.), +// literal_suffix: utf8 (Character or characters used to terminate a literal, +// NULL is returned for data types where a literal suffix is not applicable.), +// create_params: list +// (A list of keywords corresponding to which parameters can be used when creating +// a column for that specific type. +// NULL is returned if there are no parameters for the data type definition.), +// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the +// Nullable enum.), +// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), +// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the +// Searchable enum.), +// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is +// not applicable to the data type or the data type is not numeric.), +// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), +// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute +// is not applicable to the data type or the data type is not numeric.), +// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL +// is returned if a localized name is not supported by the data source), +// minimum_scale: int32 (The minimum scale of the data type on the data source. +// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE +// columns both contain this value. NULL is returned if scale is not applicable.), +// maximum_scale: int32 (The maximum scale of the data type on the data source. +// NULL is returned if scale is not applicable.), +// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values +// as data_type value. Except for interval and datetime, which +// uses generic values. More info about those types can be +// obtained through datetime_subcode. The possible values can be seen +// in the XdbcDataType enum.), +// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. 
It contains +// its sub types. For type different from interval and datetime, this value +// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), +// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains +// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For +// exact numeric types, this column contains the value 10 to indicate that +// column size specifies a number of decimal digits. Otherwise, this column is NULL.), +// interval_precision: int32 (If the data type is an interval data type, then this column contains the value +// of the interval leading precision. Otherwise, this column is NULL. This fields +// is only relevant to be used by ODBC). // > // The returned data should be ordered by data_type and then by type_name. type CommandGetXdbcTypeInfo struct { @@ -2379,6 +2438,7 @@ type CommandGetXdbcTypeInfo struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the data type to search for the info. DataType *int32 `protobuf:"varint,1,opt,name=data_type,json=dataType,proto3,oneof" json:"data_type,omitempty"` } @@ -2422,17 +2482,16 @@ func (x *CommandGetXdbcTypeInfo) GetDataType() int32 { return 0 } +// // Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. // The definition of a catalog depends on vendor/implementation. It is usually the database itself // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8 not null -// +// catalog_name: utf8 not null // > // The returned data should be ordered by catalog_name. type CommandGetCatalogs struct { @@ -2473,18 +2532,17 @@ func (*CommandGetCatalogs) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{2} } +// // Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. // The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8 not null -// +// catalog_name: utf8, +// db_schema_name: utf8 not null // > // The returned data should be ordered by catalog_name, then db_schema_name. type CommandGetDbSchemas struct { @@ -2492,15 +2550,17 @@ type CommandGetDbSchemas struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the Catalog to search for the tables. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies a filter pattern for schemas to search for. // When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. 
// In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. DbSchemaFilterPattern *string `protobuf:"bytes,2,opt,name=db_schema_filter_pattern,json=dbSchemaFilterPattern,proto3,oneof" json:"db_schema_filter_pattern,omitempty"` } @@ -2550,56 +2610,58 @@ func (x *CommandGetDbSchemas) GetDbSchemaFilterPattern() string { return "" } +// // Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8, -// table_name: utf8 not null, -// table_type: utf8 not null, -// [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, -// it is serialized as an IPC message.) -// +// catalog_name: utf8, +// db_schema_name: utf8, +// table_name: utf8 not null, +// table_type: utf8 not null, +// [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, +// it is serialized as an IPC message.) // > // Fields on table_schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. // The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. 
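// ---------------------------------------------------------------------------
// Aside: a minimal sketch, not part of this diff, of how the "%" / "_" filter
// patterns and include_schema documented above are typically exercised. It
// assumes the high-level Go wrapper package (flightsql.NewClient,
// flightsql.GetTablesOpts, Client.GetTables) matches this generated protocol;
// treat those names, and the address, as assumptions rather than guarantees.
// ---------------------------------------------------------------------------
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/apache/arrow/go/v13/arrow/flight/flightsql"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/protobuf/proto"
)

func main() {
	client, err := flightsql.NewClient("localhost:32010", nil, nil,
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	// "%" matches any substring of zero or more characters; "_" matches
	// exactly one character. "orders_%" therefore matches orders_2023,
	// orders_archive, and so on.
	info, err := client.GetTables(context.Background(), &flightsql.GetTablesOpts{
		DbSchemaFilterPattern:  proto.String("public"),
		TableNameFilterPattern: proto.String("orders_%"),
		TableTypes:             []string{"TABLE", "VIEW"},
		IncludeSchema:          true, // also return each table's serialized Arrow schema
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("endpoints:", len(info.Endpoint))
}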
type CommandGetTables struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the Catalog to search for the tables. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies a filter pattern for schemas to search for. // When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. DbSchemaFilterPattern *string `protobuf:"bytes,2,opt,name=db_schema_filter_pattern,json=dbSchemaFilterPattern,proto3,oneof" json:"db_schema_filter_pattern,omitempty"` + // // Specifies a filter pattern for tables to search for. // When no table_name_filter_pattern is provided, all tables matching other filters are searched. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. TableNameFilterPattern *string `protobuf:"bytes,3,opt,name=table_name_filter_pattern,json=tableNameFilterPattern,proto3,oneof" json:"table_name_filter_pattern,omitempty"` + // // Specifies a filter of table types which must match. // The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. // TABLE, VIEW, and SYSTEM TABLE are commonly supported. @@ -2675,18 +2737,17 @@ func (x *CommandGetTables) GetIncludeSchema() bool { return false } +// // Represents a request to retrieve the list of table types on a Flight SQL enabled backend. // The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. // TABLE, VIEW, and SYSTEM TABLE are commonly supported. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// table_type: utf8 not null -// +// table_type: utf8 not null // > // The returned data should be ordered by table_type. type CommandGetTableTypes struct { @@ -2727,21 +2788,20 @@ func (*CommandGetTableTypes) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{5} } +// // Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. 
// // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8, -// table_name: utf8 not null, -// column_name: utf8 not null, -// key_name: utf8, -// key_sequence: int32 not null -// +// catalog_name: utf8, +// db_schema_name: utf8, +// table_name: utf8 not null, +// column_name: utf8 not null, +// key_name: utf8, +// key_sequence: int32 not null // > // The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. type CommandGetPrimaryKeys struct { @@ -2749,10 +2809,12 @@ type CommandGetPrimaryKeys struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. @@ -2814,29 +2876,28 @@ func (x *CommandGetPrimaryKeys) GetTable() string { return "" } +// // Represents a request to retrieve a description of the foreign key columns that reference the given table's // primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. @@ -2845,10 +2906,12 @@ type CommandGetExportedKeys struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the foreign key table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the foreign key table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. 
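// ---------------------------------------------------------------------------
// Aside: a minimal sketch, not part of this diff. The key-metadata results
// above carry update_rule / delete_rule as raw uint8 columns; this helper
// decodes them using the action values listed in the comments below
// (0 = CASCADE through 4 = SET DEFAULT).
// ---------------------------------------------------------------------------
package main

import "fmt"

// referentialAction maps an update_rule / delete_rule byte to its SQL name.
func referentialAction(b uint8) string {
	switch b {
	case 0:
		return "CASCADE"
	case 1:
		return "RESTRICT"
	case 2:
		return "SET NULL"
	case 3:
		return "NO ACTION"
	case 4:
		return "SET DEFAULT"
	default:
		return fmt.Sprintf("UNKNOWN(%d)", b)
	}
}

func main() {
	fmt.Println(referentialAction(0)) // CASCADE
	fmt.Println(referentialAction(3)) // NO ACTION
}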
@@ -2910,45 +2973,46 @@ func (x *CommandGetExportedKeys) GetTable() string { return "" } +// // Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions: -// - 0 = CASCADE -// - 1 = RESTRICT -// - 2 = SET NULL -// - 3 = NO ACTION -// - 4 = SET DEFAULT +// - 0 = CASCADE +// - 1 = RESTRICT +// - 2 = SET NULL +// - 3 = NO ACTION +// - 4 = SET DEFAULT type CommandGetImportedKeys struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the primary key table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the primary key table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. @@ -3010,67 +3074,66 @@ func (x *CommandGetImportedKeys) GetTable() string { return "" } +// // Represents a request to retrieve a description of the foreign key columns in the given foreign key table that // reference the primary key or the columns representing a unique constraint of the parent table (could be the same // or a different table) on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. 
// // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions: -// - 0 = CASCADE -// - 1 = RESTRICT -// - 2 = SET NULL -// - 3 = NO ACTION -// - 4 = SET DEFAULT +// - 0 = CASCADE +// - 1 = RESTRICT +// - 2 = SET NULL +// - 3 = NO ACTION +// - 4 = SET DEFAULT type CommandGetCrossReference struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - // * + //* // The catalog name where the parent table is. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. PkCatalog *string `protobuf:"bytes,1,opt,name=pk_catalog,json=pkCatalog,proto3,oneof" json:"pk_catalog,omitempty"` - // * + //* // The Schema name where the parent table is. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. PkDbSchema *string `protobuf:"bytes,2,opt,name=pk_db_schema,json=pkDbSchema,proto3,oneof" json:"pk_db_schema,omitempty"` - // * + //* // The parent table name. It cannot be null. PkTable string `protobuf:"bytes,3,opt,name=pk_table,json=pkTable,proto3" json:"pk_table,omitempty"` - // * + //* // The catalog name where the foreign table is. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. FkCatalog *string `protobuf:"bytes,4,opt,name=fk_catalog,json=fkCatalog,proto3,oneof" json:"fk_catalog,omitempty"` - // * + //* // The schema name where the foreign table is. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. FkDbSchema *string `protobuf:"bytes,5,opt,name=fk_db_schema,json=fkDbSchema,proto3,oneof" json:"fk_db_schema,omitempty"` - // * + //* // The foreign table name. It cannot be null. FkTable string `protobuf:"bytes,6,opt,name=fk_table,json=fkTable,proto3" json:"fk_table,omitempty"` } @@ -3149,6 +3212,7 @@ func (x *CommandGetCrossReference) GetFkTable() string { return "" } +// // Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. type ActionCreatePreparedStatementRequest struct { state protoimpl.MessageState @@ -3208,6 +3272,7 @@ func (x *ActionCreatePreparedStatementRequest) GetTransactionId() []byte { return nil } +// // An embedded message describing a Substrait plan to execute. 
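// ---------------------------------------------------------------------------
// Aside: a minimal sketch, not part of this diff. Each command above travels
// to the server packed in a google.protobuf.Any inside FlightDescriptor.cmd.
// It assumes the Flight SQL messages are generated into the same gen/flight
// package that this diff's rename introduces; the table names are placeholders.
// ---------------------------------------------------------------------------
package main

import (
	"log"

	flight "github.com/apache/arrow/go/v13/arrow/flight/gen/flight"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/anypb"
)

func main() {
	cmd := &flight.CommandGetCrossReference{
		PkTable: "orders",   // parent (primary-key) table, cannot be null
		FkTable: "invoices", // referencing (foreign-key) table, cannot be null
	}
	anyCmd, err := anypb.New(cmd) // wrap the command in google.protobuf.Any
	if err != nil {
		log.Fatal(err)
	}
	raw, err := proto.Marshal(anyCmd)
	if err != nil {
		log.Fatal(err)
	}
	// The packed bytes become the opaque cmd of a CMD-type FlightDescriptor,
	// which is then handed to GetFlightInfo or GetSchema.
	desc := &flight.FlightDescriptor{
		Type: flight.FlightDescriptor_CMD,
		Cmd:  raw,
	}
	_ = desc
}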
type SubstraitPlan struct { state protoimpl.MessageState @@ -3271,6 +3336,7 @@ func (x *SubstraitPlan) GetVersion() string { return "" } +// // Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. type ActionCreatePreparedSubstraitPlanRequest struct { state protoimpl.MessageState @@ -3330,6 +3396,7 @@ func (x *ActionCreatePreparedSubstraitPlanRequest) GetTransactionId() []byte { return nil } +// // Wrap the result of a "CreatePreparedStatement" or "CreatePreparedSubstraitPlan" action. // // The resultant PreparedStatement can be closed either: @@ -3405,6 +3472,7 @@ func (x *ActionCreatePreparedStatementResult) GetParameterSchema() []byte { return nil } +// // Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. // Closes server resources associated with the prepared statement handle. type ActionClosePreparedStatementRequest struct { @@ -3455,6 +3523,7 @@ func (x *ActionClosePreparedStatementRequest) GetPreparedStatementHandle() []byt return nil } +// // Request message for the "BeginTransaction" action. // Begins a transaction. type ActionBeginTransactionRequest struct { @@ -3495,6 +3564,7 @@ func (*ActionBeginTransactionRequest) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{15} } +// // Request message for the "BeginSavepoint" action. // Creates a savepoint within a transaction. // @@ -3557,6 +3627,7 @@ func (x *ActionBeginSavepointRequest) GetName() string { return "" } +// // The result of a "BeginTransaction" action. // // The transaction can be manipulated with the "EndTransaction" action, or @@ -3612,6 +3683,7 @@ func (x *ActionBeginTransactionResult) GetTransactionId() []byte { return nil } +// // The result of a "BeginSavepoint" action. // // The transaction can be manipulated with the "EndSavepoint" action. @@ -3667,6 +3739,7 @@ func (x *ActionBeginSavepointResult) GetSavepointId() []byte { return nil } +// // Request message for the "EndTransaction" action. // // Commit (COMMIT) or rollback (ROLLBACK) the transaction. @@ -3730,6 +3803,7 @@ func (x *ActionEndTransactionRequest) GetAction() ActionEndTransactionRequest_En return ActionEndTransactionRequest_END_TRANSACTION_UNSPECIFIED } +// // Request message for the "EndSavepoint" action. // // Release (RELEASE) the savepoint or rollback (ROLLBACK) to the @@ -3795,21 +3869,22 @@ func (x *ActionEndSavepointRequest) GetAction() ActionEndSavepointRequest_EndSav return ActionEndSavepointRequest_END_SAVEPOINT_UNSPECIFIED } +// // Represents a SQL query. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. 
-// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - GetFlightInfo: execute the query. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - GetFlightInfo: execute the query. type CommandStatementQuery struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -3867,22 +3942,23 @@ func (x *CommandStatementQuery) GetTransactionId() []byte { return nil } +// // Represents a Substrait plan. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - GetFlightInfo: execute the query. -// - DoPut: execute the query. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - GetFlightInfo: execute the query. +// - DoPut: execute the query. 
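// ---------------------------------------------------------------------------
// Aside: a minimal sketch, not part of this diff, of the query flow these
// commands describe: GetFlightInfo plans the statement, then each endpoint's
// one-shot opaque ticket is redeemed with DoGet. The wrapper names
// (flightsql.NewClient, Client.Execute, Client.DoGet) are assumptions about
// the high-level Go client, not part of this generated file.
// ---------------------------------------------------------------------------
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/apache/arrow/go/v13/arrow/flight/flightsql"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	ctx := context.Background()
	client, err := flightsql.NewClient("localhost:32010", nil, nil,
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	info, err := client.Execute(ctx, "SELECT 1 AS answer") // GetFlightInfo under the hood
	if err != nil {
		log.Fatal(err)
	}
	for _, ep := range info.Endpoint {
		// The ticket wraps a TicketStatementQuery; treat it as opaque and
		// use it exactly once.
		rdr, err := client.DoGet(ctx, ep.Ticket)
		if err != nil {
			log.Fatal(err)
		}
		for rdr.Next() {
			fmt.Println(rdr.Record())
		}
		rdr.Release()
	}
}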
type CommandStatementSubstraitPlan struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -3940,7 +4016,7 @@ func (x *CommandStatementSubstraitPlan) GetTransactionId() []byte { return nil } -// * +//* // Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. // This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. type TicketStatementQuery struct { @@ -3991,22 +4067,23 @@ func (x *TicketStatementQuery) GetStatementHandle() []byte { return nil } +// // Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for // the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. -// - GetFlightInfo: execute the prepared statement instance. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. +// - GetFlightInfo: execute the prepared statement instance. type CommandPreparedStatementQuery struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -4055,6 +4132,7 @@ func (x *CommandPreparedStatementQuery) GetPreparedStatementHandle() []byte { return nil } +// // Represents a SQL update query. Used in the command member of FlightDescriptor // for the the RPC call DoPut to cause the server to execute the included SQL update. type CommandStatementUpdate struct { @@ -4114,6 +4192,7 @@ func (x *CommandStatementUpdate) GetTransactionId() []byte { return nil } +// // Represents a SQL update query. 
Used in the command member of FlightDescriptor // for the the RPC call DoPut to cause the server to execute the included // prepared statement handle as an update. @@ -4165,6 +4244,7 @@ func (x *CommandPreparedStatementUpdate) GetPreparedStatementHandle() []byte { return nil } +// // Returned from the RPC call DoPut when a CommandStatementUpdate // CommandPreparedStatementUpdate was in the request, containing // results from the update. @@ -4217,6 +4297,7 @@ func (x *DoPutUpdateResult) GetRecordCount() int64 { return 0 } +// // Request message for the "CancelQuery" action. // // Explicitly cancel a running query. @@ -4285,10 +4366,14 @@ func (x *ActionCancelQueryRequest) GetInfo() []byte { return nil } +// // The result of cancelling a query. // // The result should be wrapped in a google.protobuf.Any message. // +// This command is deprecated since 13.0.0. Use the "CancelFlightInfo" +// action with DoAction instead. +// // Deprecated: Do not use. type ActionCancelQueryResult struct { state protoimpl.MessageState @@ -4339,7 +4424,7 @@ func (x *ActionCancelQueryResult) GetResult() ActionCancelQueryResult_CancelResu var file_FlightSql_proto_extTypes = []protoimpl.ExtensionInfo{ { - ExtendedType: (*descriptorpb.MessageOptions)(nil), + ExtendedType: (*descriptor.MessageOptions)(nil), ExtensionType: (*bool)(nil), Field: 1000, Name: "arrow.flight.protocol.sql.experimental", @@ -4348,7 +4433,7 @@ var file_FlightSql_proto_extTypes = []protoimpl.ExtensionInfo{ }, } -// Extension fields to descriptorpb.MessageOptions. +// Extension fields to descriptor.MessageOptions. var ( // optional bool experimental = 1000; E_Experimental = &file_FlightSql_proto_extTypes[0] @@ -5163,7 +5248,7 @@ var file_FlightSql_proto_goTypes = []interface{}{ (*DoPutUpdateResult)(nil), // 52: arrow.flight.protocol.sql.DoPutUpdateResult (*ActionCancelQueryRequest)(nil), // 53: arrow.flight.protocol.sql.ActionCancelQueryRequest (*ActionCancelQueryResult)(nil), // 54: arrow.flight.protocol.sql.ActionCancelQueryResult - (*descriptorpb.MessageOptions)(nil), // 55: google.protobuf.MessageOptions + (*descriptor.MessageOptions)(nil), // 55: google.protobuf.MessageOptions } var file_FlightSql_proto_depIdxs = []int32{ 36, // 0: arrow.flight.protocol.sql.ActionCreatePreparedSubstraitPlanRequest.plan:type_name -> arrow.flight.protocol.sql.SubstraitPlan diff --git a/go/arrow/flight/internal/flight/Flight_grpc.pb.go b/go/arrow/flight/gen/flight/Flight_grpc.pb.go similarity index 93% rename from go/arrow/flight/internal/flight/Flight_grpc.pb.go rename to go/arrow/flight/gen/flight/Flight_grpc.pb.go index 10fd285a5c10b..9613114448796 100644 --- a/go/arrow/flight/internal/flight/Flight_grpc.pb.go +++ b/go/arrow/flight/gen/flight/Flight_grpc.pb.go @@ -1,4 +1,8 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.12.4 +// source: Flight.proto package flight @@ -11,17 +15,20 @@ import ( // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // FlightServiceClient is the client API for FlightService service. // // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. type FlightServiceClient interface { + // // Handshake between client and server. 
Depending on the server, the // handshake may be required to determine the token that should be used for // future operations. Both request and response are streams to allow multiple // round-trips depending on auth mechanism. Handshake(ctx context.Context, opts ...grpc.CallOption) (FlightService_HandshakeClient, error) + // // Get a list of available streams given a particular criteria. Most flight // services will expose one or more streams that are readily available for // retrieval. This api allows listing the streams available for @@ -29,6 +36,7 @@ type FlightServiceClient interface { // the subset of streams that can be listed via this interface. Each flight // service allows its own definition of how to consume criteria. ListFlights(ctx context.Context, in *Criteria, opts ...grpc.CallOption) (FlightService_ListFlightsClient, error) + // // For a given FlightDescriptor, get information about how the flight can be // consumed. This is a useful interface if the consumer of the interface // already can identify the specific flight to consume. This interface can @@ -40,16 +48,19 @@ type FlightServiceClient interface { // available for consumption for the duration defined by the specific flight // service. GetFlightInfo(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*FlightInfo, error) + // // For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema // This is used when a consumer needs the Schema of flight stream. Similar to // GetFlightInfo this interface may generate a new flight that was not previously // available in ListFlights. GetSchema(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*SchemaResult, error) + // // Retrieve a single stream associated with a particular descriptor // associated with the referenced ticket. A Flight can be composed of one or // more streams where each stream can be retrieved using a separate opaque // ticket that the flight service uses for managing a collection of streams. DoGet(ctx context.Context, in *Ticket, opts ...grpc.CallOption) (FlightService_DoGetClient, error) + // // Push a stream to the flight service associated with a particular // flight stream. This allows a client of a flight service to upload a stream // of data. Depending on the particular flight service, a client consumer @@ -57,12 +68,14 @@ type FlightServiceClient interface { // number. In the latter, the service might implement a 'seal' action that // can be applied to a descriptor once all streams are uploaded. DoPut(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoPutClient, error) + // // Open a bidirectional data channel for a given descriptor. This // allows clients to send and receive arbitrary Arrow data and // application-specific metadata in a single logical stream. In // contrast to DoGet/DoPut, this is more suited for clients // offloading computation (rather than storage) to a Flight service. DoExchange(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoExchangeClient, error) + // // Flight services can support an arbitrary number of simple actions in // addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut // operations that are potentially available. DoAction allows a flight client @@ -70,6 +83,7 @@ type FlightServiceClient interface { // opaque request and response objects that are specific to the type action // being undertaken. 
DoAction(ctx context.Context, in *Action, opts ...grpc.CallOption) (FlightService_DoActionClient, error) + // // A flight service exposes all of the available action types that it has // along with descriptions. This allows different flight consumers to // understand the capabilities of the flight service. @@ -85,7 +99,7 @@ func NewFlightServiceClient(cc grpc.ClientConnInterface) FlightServiceClient { } func (c *flightServiceClient) Handshake(ctx context.Context, opts ...grpc.CallOption) (FlightService_HandshakeClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[0], "/arrow.flight.protocol.FlightService/Handshake", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[0], "/arrow.flight.protocol.FlightService/Handshake", opts...) if err != nil { return nil, err } @@ -116,7 +130,7 @@ func (x *flightServiceHandshakeClient) Recv() (*HandshakeResponse, error) { } func (c *flightServiceClient) ListFlights(ctx context.Context, in *Criteria, opts ...grpc.CallOption) (FlightService_ListFlightsClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[1], "/arrow.flight.protocol.FlightService/ListFlights", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[1], "/arrow.flight.protocol.FlightService/ListFlights", opts...) if err != nil { return nil, err } @@ -166,7 +180,7 @@ func (c *flightServiceClient) GetSchema(ctx context.Context, in *FlightDescripto } func (c *flightServiceClient) DoGet(ctx context.Context, in *Ticket, opts ...grpc.CallOption) (FlightService_DoGetClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[2], "/arrow.flight.protocol.FlightService/DoGet", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[2], "/arrow.flight.protocol.FlightService/DoGet", opts...) if err != nil { return nil, err } @@ -198,7 +212,7 @@ func (x *flightServiceDoGetClient) Recv() (*FlightData, error) { } func (c *flightServiceClient) DoPut(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoPutClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[3], "/arrow.flight.protocol.FlightService/DoPut", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[3], "/arrow.flight.protocol.FlightService/DoPut", opts...) if err != nil { return nil, err } @@ -229,7 +243,7 @@ func (x *flightServiceDoPutClient) Recv() (*PutResult, error) { } func (c *flightServiceClient) DoExchange(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoExchangeClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[4], "/arrow.flight.protocol.FlightService/DoExchange", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[4], "/arrow.flight.protocol.FlightService/DoExchange", opts...) if err != nil { return nil, err } @@ -260,7 +274,7 @@ func (x *flightServiceDoExchangeClient) Recv() (*FlightData, error) { } func (c *flightServiceClient) DoAction(ctx context.Context, in *Action, opts ...grpc.CallOption) (FlightService_DoActionClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[5], "/arrow.flight.protocol.FlightService/DoAction", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[5], "/arrow.flight.protocol.FlightService/DoAction", opts...) 
if err != nil { return nil, err } @@ -292,7 +306,7 @@ func (x *flightServiceDoActionClient) Recv() (*Result, error) { } func (c *flightServiceClient) ListActions(ctx context.Context, in *Empty, opts ...grpc.CallOption) (FlightService_ListActionsClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[6], "/arrow.flight.protocol.FlightService/ListActions", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[6], "/arrow.flight.protocol.FlightService/ListActions", opts...) if err != nil { return nil, err } @@ -327,11 +341,13 @@ func (x *flightServiceListActionsClient) Recv() (*ActionType, error) { // All implementations must embed UnimplementedFlightServiceServer // for forward compatibility type FlightServiceServer interface { + // // Handshake between client and server. Depending on the server, the // handshake may be required to determine the token that should be used for // future operations. Both request and response are streams to allow multiple // round-trips depending on auth mechanism. Handshake(FlightService_HandshakeServer) error + // // Get a list of available streams given a particular criteria. Most flight // services will expose one or more streams that are readily available for // retrieval. This api allows listing the streams available for @@ -339,6 +355,7 @@ type FlightServiceServer interface { // the subset of streams that can be listed via this interface. Each flight // service allows its own definition of how to consume criteria. ListFlights(*Criteria, FlightService_ListFlightsServer) error + // // For a given FlightDescriptor, get information about how the flight can be // consumed. This is a useful interface if the consumer of the interface // already can identify the specific flight to consume. This interface can @@ -350,16 +367,19 @@ type FlightServiceServer interface { // available for consumption for the duration defined by the specific flight // service. GetFlightInfo(context.Context, *FlightDescriptor) (*FlightInfo, error) + // // For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema // This is used when a consumer needs the Schema of flight stream. Similar to // GetFlightInfo this interface may generate a new flight that was not previously // available in ListFlights. GetSchema(context.Context, *FlightDescriptor) (*SchemaResult, error) + // // Retrieve a single stream associated with a particular descriptor // associated with the referenced ticket. A Flight can be composed of one or // more streams where each stream can be retrieved using a separate opaque // ticket that the flight service uses for managing a collection of streams. DoGet(*Ticket, FlightService_DoGetServer) error + // // Push a stream to the flight service associated with a particular // flight stream. This allows a client of a flight service to upload a stream // of data. Depending on the particular flight service, a client consumer @@ -367,12 +387,14 @@ type FlightServiceServer interface { // number. In the latter, the service might implement a 'seal' action that // can be applied to a descriptor once all streams are uploaded. DoPut(FlightService_DoPutServer) error + // // Open a bidirectional data channel for a given descriptor. This // allows clients to send and receive arbitrary Arrow data and // application-specific metadata in a single logical stream. In // contrast to DoGet/DoPut, this is more suited for clients // offloading computation (rather than storage) to a Flight service. 
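// ---------------------------------------------------------------------------
// Aside: a minimal sketch, not part of this diff, of the server side these
// interfaces describe: embed UnimplementedFlightServiceServer for forward
// compatibility, override what you need, and register through
// RegisterFlightServiceServer, whose parameter this diff widens to
// grpc.ServiceRegistrar. The stub GetSchema body is illustrative only.
// ---------------------------------------------------------------------------
package main

import (
	"context"
	"log"
	"net"

	flight "github.com/apache/arrow/go/v13/arrow/flight/gen/flight"
	"google.golang.org/grpc"
)

type server struct {
	// Embedding keeps this type compiling when new RPCs are added to the service.
	flight.UnimplementedFlightServiceServer
}

// GetSchema is the only RPC overridden here; all others fall through to the
// Unimplemented stub. Real servers return an IPC-serialized Schema.fbs::Schema.
func (s *server) GetSchema(ctx context.Context, in *flight.FlightDescriptor) (*flight.SchemaResult, error) {
	return &flight.SchemaResult{}, nil
}

func main() {
	lis, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		log.Fatal(err)
	}
	gs := grpc.NewServer()
	// *grpc.Server satisfies grpc.ServiceRegistrar, the new parameter type.
	flight.RegisterFlightServiceServer(gs, &server{})
	log.Println("flight service listening on", lis.Addr())
	_ = gs.Serve(lis)
}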
DoExchange(FlightService_DoExchangeServer) error + // // Flight services can support an arbitrary number of simple actions in // addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut // operations that are potentially available. DoAction allows a flight client @@ -380,6 +402,7 @@ type FlightServiceServer interface { // opaque request and response objects that are specific to the type action // being undertaken. DoAction(*Action, FlightService_DoActionServer) error + // // A flight service exposes all of the available action types that it has // along with descriptions. This allows different flight consumers to // understand the capabilities of the flight service. @@ -427,8 +450,8 @@ type UnsafeFlightServiceServer interface { mustEmbedUnimplementedFlightServiceServer() } -func RegisterFlightServiceServer(s *grpc.Server, srv FlightServiceServer) { - s.RegisterService(&_FlightService_serviceDesc, srv) +func RegisterFlightServiceServer(s grpc.ServiceRegistrar, srv FlightServiceServer) { + s.RegisterService(&FlightService_ServiceDesc, srv) } func _FlightService_Handshake_Handler(srv interface{}, stream grpc.ServerStream) error { @@ -629,7 +652,10 @@ func (x *flightServiceListActionsServer) Send(m *ActionType) error { return x.ServerStream.SendMsg(m) } -var _FlightService_serviceDesc = grpc.ServiceDesc{ +// FlightService_ServiceDesc is the grpc.ServiceDesc for FlightService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var FlightService_ServiceDesc = grpc.ServiceDesc{ ServiceName: "arrow.flight.protocol.FlightService", HandlerType: (*FlightServiceServer)(nil), Methods: []grpc.MethodDesc{ diff --git a/go/arrow/flight/server.go b/go/arrow/flight/server.go index 1dd02d0defaed..c5e64986d5f78 100644 --- a/go/arrow/flight/server.go +++ b/go/arrow/flight/server.go @@ -22,7 +22,7 @@ import ( "os" "os/signal" - "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "google.golang.org/grpc" ) diff --git a/go/arrow/ipc/reader.go b/go/arrow/ipc/reader.go index 99aab597ce950..bee48cf965682 100644 --- a/go/arrow/ipc/reader.go +++ b/go/arrow/ipc/reader.go @@ -159,6 +159,7 @@ func (r *Reader) Release() { r.r.Release() r.r = nil } + r.memo.Clear() } } diff --git a/go/arrow/ipc/reader_test.go b/go/arrow/ipc/reader_test.go index a8930984fbf37..7bcf737af0d6d 100644 --- a/go/arrow/ipc/reader_test.go +++ b/go/arrow/ipc/reader_test.go @@ -56,3 +56,40 @@ func TestReaderCatchPanic(t *testing.T) { assert.Contains(t, err.Error(), "arrow/ipc: unknown error while reading") } } + +func TestReaderCheckedAllocator(t *testing.T) { + alloc := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer alloc.AssertSize(t, 0) + schema := arrow.NewSchema([]arrow.Field{ + { + Name: "s", + Type: &arrow.DictionaryType{ + ValueType: arrow.BinaryTypes.String, + IndexType: arrow.PrimitiveTypes.Int32, + }, + }, + }, nil) + + b := array.NewRecordBuilder(alloc, schema) + defer b.Release() + + bldr := b.Field(0).(*array.BinaryDictionaryBuilder) + bldr.Append([]byte("foo")) + bldr.Append([]byte("bar")) + bldr.Append([]byte("baz")) + + rec := b.NewRecord() + defer rec.Release() + + buf := new(bytes.Buffer) + writer := NewWriter(buf, WithSchema(schema), WithAllocator(alloc)) + defer writer.Close() + require.NoError(t, writer.Write(rec)) + + reader, err := NewReader(buf, WithAllocator(alloc)) + require.NoError(t, err) + defer reader.Release() + + _, err = reader.Read() + 
require.NoError(t, err) +} diff --git a/go/internal/hashing/xxh3_memo_table.gen.go b/go/internal/hashing/xxh3_memo_table.gen.go index 0c36aee950f83..f561c5f30f895 100644 --- a/go/internal/hashing/xxh3_memo_table.gen.go +++ b/go/internal/hashing/xxh3_memo_table.gen.go @@ -298,6 +298,11 @@ func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err e return } +// GetOrInsertBytes is unimplemented +func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint8 struct { val uint8 memoIdx int32 @@ -570,6 +575,11 @@ func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadInt16 struct { val int16 memoIdx int32 @@ -842,6 +852,11 @@ func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint16 struct { val uint16 memoIdx int32 @@ -1114,6 +1129,11 @@ func (s *Uint16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadInt32 struct { val int32 memoIdx int32 @@ -1386,6 +1406,11 @@ func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadInt64 struct { val int64 memoIdx int32 @@ -1658,6 +1683,11 @@ func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint32 struct { val uint32 memoIdx int32 @@ -1930,6 +1960,11 @@ func (s *Uint32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint64 struct { val uint64 memoIdx int32 @@ -2202,6 +2237,11 @@ func (s *Uint64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadFloat32 struct { val float32 memoIdx int32 @@ -2493,6 +2533,11 @@ func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, er return } +// GetOrInsertBytes is unimplemented +func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadFloat64 struct { val float64 memoIdx int32 @@ -2781,3 +2826,8 @@ func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, er } return } + +// GetOrInsertBytes is unimplemented +func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} diff --git a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl 
b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl index 94c893b94b314..10127c43cc6b1 100644 --- a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl +++ b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl @@ -340,4 +340,10 @@ func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, } return } + + +// GetOrInsertBytes is unimplemented +func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} {{end}} diff --git a/go/internal/hashing/xxh3_memo_table.go b/go/internal/hashing/xxh3_memo_table.go index 67e2aef380488..81994f0a88541 100644 --- a/go/internal/hashing/xxh3_memo_table.go +++ b/go/internal/hashing/xxh3_memo_table.go @@ -53,6 +53,12 @@ type MemoTable interface { // the table (if false, the value was inserted). An error is returned // if val is not the appropriate type for the table. GetOrInsert(val interface{}) (idx int, existed bool, err error) + // GetOrInsertBytes returns the index of the table the specified value is, + // and a boolean indicating whether or not the value was found in + // the table (if false, the value was inserted). An error is returned + // if val is not the appropriate type for the table. This function is intended to be used by + // the BinaryMemoTable to prevent uncessary allocations of the data when converting from a []byte to interface{}. + GetOrInsertBytes(val []byte) (idx int, existed bool, err error) // GetOrInsertNull returns the index of the null value in the table, // inserting one if it hasn't already been inserted. It returns a boolean // indicating if the null value already existed or not in the table. @@ -231,6 +237,22 @@ func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { return KeyNotFound, false } +// GetOrInsertBytes returns the index of the given value in the table, if not found +// it is inserted into the table. The return value 'found' indicates whether the value +// was found in the table (true) or inserted (false) along with any possible error. +func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + h := Hash(val, 0) + p, found := b.lookup(h, val) + if found { + idx = int(p.payload.val) + } else { + idx = b.Size() + b.builder.Append(val) + b.tbl.Insert(p, h, int32(idx), -1) + } + return +} + // GetOrInsert returns the index of the given value in the table, if not found // it is inserted into the table. The return value 'found' indicates whether the value // was found in the table (true) or inserted (false) along with any possible error. diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go index 6b201cadcee26..fa5a51cb5b8d5 100644 --- a/go/parquet/file/file_reader_test.go +++ b/go/parquet/file/file_reader_test.go @@ -333,3 +333,55 @@ func TestIncompleteMetadata(t *testing.T) { _, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) assert.Error(t, err) } + +func TestDeltaLengthByteArrayPackingWithNulls(t *testing.T) { + // produce file with DeltaLengthByteArray Encoding with mostly null values but one actual value. 
+ root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{ + schema.NewByteArrayNode("byte_array_col", parquet.Repetitions.Optional, -1), + }, -1) + props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST), + parquet.WithEncoding(parquet.Encodings.DeltaLengthByteArray), parquet.WithDictionaryDefault(false)) + sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) + + writer := file.NewParquetWriter(sink, root, file.WithWriterProps(props)) + rgw := writer.AppendRowGroup() + ccw, err := rgw.NextColumn() + assert.NoError(t, err) + const elements = 500 + data := make([]parquet.ByteArray, elements) + data[0] = parquet.ByteArray{1, 2, 3, 4, 5, 6, 7, 8} + + defLvls := make([]int16, elements) + repLvls := make([]int16, elements) + defLvls[0] = 1 + + _, err = ccw.(*file.ByteArrayColumnChunkWriter).WriteBatch(data, defLvls, repLvls) + assert.NoError(t, err) + assert.NoError(t, ccw.Close()) + assert.NoError(t, rgw.Close()) + assert.NoError(t, writer.Close()) + buf := sink.Finish() + defer buf.Release() + + // read file back in + reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) + assert.NoError(t, err) + defer reader.Close() + ccr, err := reader.RowGroup(0).Column(0) + assert.NoError(t, err) + const batchSize = 500 + + for ccr.HasNext() { + readData := make([]parquet.ByteArray, batchSize) + readdevLvls := make([]int16, batchSize) + readrepLvls := make([]int16, batchSize) + cr := ccr.(*file.ByteArrayColumnChunkReader) + + total, read, err := cr.ReadBatch(batchSize, readData, readdevLvls, readrepLvls) + assert.NoError(t, err) + assert.Equal(t, int64(batchSize), total) + assert.Equal(t, 1, read) + assert.Equal(t, data[0], readData[0]) + assert.NotNil(t, readData[0]) + } +} diff --git a/go/parquet/file/file_writer.go b/go/parquet/file/file_writer.go index c931377323e2b..cd0445f4180f1 100644 --- a/go/parquet/file/file_writer.go +++ b/go/parquet/file/file_writer.go @@ -18,6 +18,7 @@ package file import ( "encoding/binary" + "fmt" "io" "github.com/apache/arrow/go/v13/parquet" @@ -155,7 +156,7 @@ func (fw *Writer) startFile() { // Close closes any open row group writer and writes the file footer. Subsequent // calls to close will have no effect. 
-func (fw *Writer) Close() error { +func (fw *Writer) Close() (err error) { if fw.open { // if any functions here panic, we set open to be false so // that this doesn't get called again @@ -165,11 +166,20 @@ func (fw *Writer) Close() error { fw.rowGroupWriter.Close() } fw.rowGroupWriter = nil - defer fw.sink.Close() + defer func() { + ierr := fw.sink.Close() + if err != nil { + if ierr != nil { + err = fmt.Errorf("error on close:%w, %s", err, ierr) + } + return + } + + err = ierr + }() fileEncryptProps := fw.props.FileEncryptionProperties() if fileEncryptProps == nil { // non encrypted file - var err error if fw.FileMetadata, err = fw.metadata.Finish(); err != nil { return err } diff --git a/go/parquet/file/file_writer_test.go b/go/parquet/file/file_writer_test.go index bba0d2be28d98..2cbdb910724ad 100644 --- a/go/parquet/file/file_writer_test.go +++ b/go/parquet/file/file_writer_test.go @@ -18,6 +18,7 @@ package file_test import ( "bytes" + "fmt" "reflect" "testing" @@ -395,3 +396,25 @@ func TestSerialize(t *testing.T) { }) } } + +type errCloseWriter struct { + sink *encoding.BufferWriter +} + +func (c *errCloseWriter) Write(p []byte) (n int, err error) { + return c.sink.Write(p) +} +func (c *errCloseWriter) Close() error { + return fmt.Errorf("error during close") +} +func (c *errCloseWriter) Bytes() []byte { + return c.sink.Bytes() +} + +func TestCloseError(t *testing.T) { + fields := schema.FieldList{schema.NewInt32Node("col", parquet.Repetitions.Required, 1)} + sc, _ := schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, 0) + sink := &errCloseWriter{sink: encoding.NewBufferWriter(0, memory.DefaultAllocator)} + writer := file.NewParquetWriter(sink, sc) + assert.Error(t, writer.Close()) +} diff --git a/go/parquet/internal/encoding/delta_bit_packing.go b/go/parquet/internal/encoding/delta_bit_packing.go index 2ebe6ad98354c..ab542eabb2d3d 100644 --- a/go/parquet/internal/encoding/delta_bit_packing.go +++ b/go/parquet/internal/encoding/delta_bit_packing.go @@ -156,7 +156,7 @@ func (d *DeltaBitPackInt32Decoder) unpackNextMini() error { // Decode retrieves min(remaining values, len(out)) values from the data and returns the number // of values actually decoded and any errors encountered. func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) { - max := shared_utils.MinInt(len(out), d.nvals) + max := shared_utils.MinInt(len(out), int(d.totalValues)) if max == 0 { return 0, nil } @@ -315,7 +315,7 @@ const ( // Consists of a header followed by blocks of delta encoded values binary packed. // // Format -// [header] [block 1] [block 2] ... [block N] +// [header] [block 1] [block 2] ... [block N] // // Header // [block size] [number of mini blocks per block] [total value count] [first value] diff --git a/go/parquet/internal/encoding/delta_byte_array_test.go b/go/parquet/internal/encoding/delta_byte_array_test.go new file mode 100644 index 0000000000000..1c008505252fb --- /dev/null +++ b/go/parquet/internal/encoding/delta_byte_array_test.go @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "fmt" + "github.com/apache/arrow/go/v13/arrow/memory" + "github.com/apache/arrow/go/v13/parquet" + "github.com/stretchr/testify/assert" + "testing" +) + +func TestDeltaByteArrayDecoder_SetData(t *testing.T) { + tests := []struct { + name string + nvalues int + data []byte + wantErr assert.ErrorAssertionFunc + }{ + { + name: "null only page", + nvalues: 126609, + data: []byte{128, 1, 4, 0, 0}, + wantErr: assert.NoError, + }, + } + for _, tt := range tests { + d := NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, nil, memory.DefaultAllocator) + t.Run(tt.name, func(t *testing.T) { + tt.wantErr(t, d.SetData(tt.nvalues, tt.data), fmt.Sprintf("SetData(%v, %v)", tt.nvalues, tt.data)) + }) + } +} diff --git a/go/parquet/internal/encoding/delta_length_byte_array.go b/go/parquet/internal/encoding/delta_length_byte_array.go index c11ded1b8f352..d719dcf829cbd 100644 --- a/go/parquet/internal/encoding/delta_length_byte_array.go +++ b/go/parquet/internal/encoding/delta_length_byte_array.go @@ -117,7 +117,7 @@ func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) error { if err := dec.SetData(nvalues, data); err != nil { return err } - d.lengths = make([]int32, nvalues) + d.lengths = make([]int32, dec.totalValues) dec.Decode(d.lengths) return d.decoder.SetData(nvalues, data[int(dec.bytesRead()):]) diff --git a/go/parquet/schema/helpers.go b/go/parquet/schema/helpers.go index 7cc89efca6e8e..1198b0b926ac8 100644 --- a/go/parquet/schema/helpers.go +++ b/go/parquet/schema/helpers.go @@ -24,43 +24,62 @@ import ( // ListOf is a convenience helper function to create a properly structured // list structure according to the Parquet Spec. // -// group (LIST) { -// repeated group list { -// element; -// } -// } +// group (LIST) { +// repeated group list { +// element; +// } +// } // -// can only be optional or required. panics if repeated. -// can only be optional or required. panics if repeated. +// can only be optional or required. +// can only be optional or required. func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) { - if rep == parquet.Repetitions.Repeated || n.RepetitionType() == parquet.Repetitions.Repeated { - return nil, xerrors.New("parquet: listof repetition and element repetition must not be repeated.") + return ListOfWithName(n.Name(), n, rep, fieldID) +} + +// ListOf is a convenience helper function to create a properly structured +// list structure according to the Parquet Spec. +// +// group (LIST) { +// repeated group list { +// element; +// } +// } +// +// can only be optional or required. +// can only be optional or required. 
diff --git a/go/parquet/schema/helpers.go b/go/parquet/schema/helpers.go
index 7cc89efca6e8e..1198b0b926ac8 100644
--- a/go/parquet/schema/helpers.go
+++ b/go/parquet/schema/helpers.go
@@ -24,43 +24,62 @@ import (

 // ListOf is a convenience helper function to create a properly structured
 // list structure according to the Parquet Spec.
 //
-// group <name> (LIST) {
-//   repeated group list {
-//     <element-repetition> <element-type> element;
-//   }
-// }
+//	group <name> (LIST) {
+//	  repeated group list {
+//	    <element-repetition> <element-type> element;
+//	  }
+//	}
 //
-// <list-repetition> can only be optional or required. panics if repeated.
-// <element-repetition> can only be optional or required. panics if repeated.
+// <list-repetition> can only be optional or required.
+// <element-repetition> can only be optional or required.
 func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) {
-	if rep == parquet.Repetitions.Repeated || n.RepetitionType() == parquet.Repetitions.Repeated {
-		return nil, xerrors.New("parquet: listof repetition and element repetition must not be repeated.")
+	return ListOfWithName(n.Name(), n, rep, fieldID)
+}
+
+// ListOfWithName is a convenience helper function to create a properly structured
+// list structure according to the Parquet Spec, using the given name for the list node.
+//
+//	group <name> (LIST) {
+//	  repeated group list {
+//	    <element-repetition> <element-type> element;
+//	  }
+//	}
+//
+// <list-repetition> can only be optional or required.
+// <element-repetition> can only be optional or required.
+func ListOfWithName(listName string, element Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) {
+	if rep == parquet.Repetitions.Repeated {
+		return nil, xerrors.Errorf("parquet: listof repetition must not be repeated, got: %s", rep)
 	}
-	listName := n.Name()
-	switch n := n.(type) {
+	if element.RepetitionType() == parquet.Repetitions.Repeated {
+		return nil, xerrors.Errorf("parquet: element repetition must not be repeated, got: %s", element.RepetitionType())
+	}
+
+	switch n := element.(type) {
 	case *PrimitiveNode:
 		n.name = "element"
 	case *GroupNode:
 		n.name = "element"
 	}

-	list, err := NewGroupNode("list" /* name */, parquet.Repetitions.Repeated, FieldList{n}, -1 /* fieldID */)
+	list, err := NewGroupNode("list" /* name */, parquet.Repetitions.Repeated, FieldList{element}, -1 /* fieldID */)
 	if err != nil {
 		return nil, err
 	}
+
 	return NewGroupNodeLogical(listName, rep, FieldList{list}, ListLogicalType{}, fieldID)
 }

 // MapOf is a convenience helper function to create a properly structured
 // parquet map node setup according to the Parquet Spec.
 //
-// group <name> (MAP) {
-//   repeated group key_value {
-//     required <key-type> key;
-//     <value-repetition> <value-type> value;
-//   }
-// }
+//	group <name> (MAP) {
+//	  repeated group key_value {
+//	    required <key-type> key;
+//	    <value-repetition> <value-type> value;
+//	  }
+//	}
 //
 // key node will be renamed to "key", value node if not nil will be renamed to "value"
 //
@@ -69,14 +88,15 @@ func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) {
 // the key node *must* be required repetition. panics if optional or repeated
 //
 // value node can be nil (omitted) or have a repetition of required or optional *only*.
-// panics if value node is not nil and has a repetition of repeated.
 func MapOf(name string, key Node, value Node, mapRep parquet.Repetition, fieldID int32) (*GroupNode, error) {
 	if mapRep == parquet.Repetitions.Repeated {
-		return nil, xerrors.New("parquet: map repetition cannot be Repeated")
+		return nil, xerrors.Errorf("parquet: map repetition cannot be Repeated, got: %s", mapRep)
 	}
+
 	if key.RepetitionType() != parquet.Repetitions.Required {
-		return nil, xerrors.New("parquet: map key repetition must be Required")
+		return nil, xerrors.Errorf("parquet: map key repetition must be Required, got: %s", key.RepetitionType())
 	}
+
 	if value != nil {
 		if value.RepetitionType() == parquet.Repetitions.Repeated {
 			return nil, xerrors.New("parquet: map value cannot have repetition Repeated")
diff --git a/go/parquet/schema/helpers_test.go b/go/parquet/schema/helpers_test.go
index 055fe7f46d127..b4f0b684003db 100644
--- a/go/parquet/schema/helpers_test.go
+++ b/go/parquet/schema/helpers_test.go
@@ -62,6 +62,25 @@ func TestListOfNested(t *testing.T) {
 	}`, strings.TrimSpace(buf.String()))
 }

+func TestListOfWithNameNested(t *testing.T) {
+	n, err := schema.ListOfWithName("arrays", schema.NewInt32Node("element", parquet.Repetitions.Required, -1), parquet.Repetitions.Required, -1)
+	assert.NoError(t, err)
+	final, err := schema.ListOf(n, parquet.Repetitions.Required, -1)
+	assert.NoError(t, err)
+
+	var buf bytes.Buffer
+	schema.PrintSchema(final, &buf, 4)
+	assert.Equal(t,
+		`required group field_id=-1 arrays (List) {
+  repeated group field_id=-1 list {
+    required group field_id=-1 element (List) {
+      repeated group field_id=-1 list {
+        required int32 field_id=-1 element;
+      }
+    }
+  }
+}`, strings.TrimSpace(buf.String()))
+}
 func TestMapOfNestedTypes(t *testing.T) {
 	n, err := schema.NewGroupNode("student", parquet.Repetitions.Required, schema.FieldList{
 		schema.NewByteArrayNode("name", parquet.Repetitions.Required, -1),
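Based on the new test above, a short usage sketch of `ListOfWithName`, which keeps a caller-chosen name for the outer list group while still renaming the element node to `element`:

```go
package main

import (
	"os"

	"github.com/apache/arrow/go/v13/parquet"
	"github.com/apache/arrow/go/v13/parquet/schema"
)

func main() {
	// A required LIST named "scores" whose elements are required int32s.
	elem := schema.NewInt32Node("element", parquet.Repetitions.Required, -1)
	scores, err := schema.ListOfWithName("scores", elem, parquet.Repetitions.Required, -1)
	if err != nil {
		// repeated repetition now yields an error instead of a panic
		panic(err)
	}
	schema.PrintSchema(scores, os.Stdout, 4)
}
```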
diff --git a/go/parquet/schema/logical_types.go b/go/parquet/schema/logical_types.go
index ade6e750adacb..4075edc1e9402 100644
--- a/go/parquet/schema/logical_types.go
+++ b/go/parquet/schema/logical_types.go
@@ -616,6 +616,55 @@ func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) Logic
 	}
 }

+// TimestampOpt is a functional option used with NewTimestampLogicalTypeWithOpts.
+type TimestampOpt func(*TimestampLogicalType)
+
+// WithTSIsAdjustedToUTC sets the IsAdjustedToUTC field of the timestamp type.
+func WithTSIsAdjustedToUTC() TimestampOpt {
+	return func(t *TimestampLogicalType) {
+		t.typ.IsAdjustedToUTC = true
+	}
+}
+
+// WithTSTimeUnitType sets the time unit for the timestamp type.
+func WithTSTimeUnitType(unit TimeUnitType) TimestampOpt {
+	return func(t *TimestampLogicalType) {
+		t.typ.Unit = createTimeUnit(unit)
+	}
+}
+
+// WithTSForceConverted enables force-converted mode.
+func WithTSForceConverted() TimestampOpt {
+	return func(t *TimestampLogicalType) {
+		t.forceConverted = true
+	}
+}
+
+// WithTSFromConverted enables the timestamp logical type to be
+// constructed from a converted type.
+func WithTSFromConverted() TimestampOpt {
+	return func(t *TimestampLogicalType) {
+		t.fromConverted = true
+	}
+}
+
+// NewTimestampLogicalTypeWithOpts creates a new TimestampLogicalType with the provided options.
+//
+// TimestampType Unit defaults to milliseconds (TimeUnitMillis).
+func NewTimestampLogicalTypeWithOpts(opts ...TimestampOpt) LogicalType {
+	ts := &TimestampLogicalType{
+		typ: &format.TimestampType{
+			Unit: createTimeUnit(TimeUnitMillis), // default to milliseconds
+		},
+	}
+
+	for _, o := range opts {
+		o(ts)
+	}
+
+	return ts
+}
+
 // TimestampLogicalType represents an int64 number that can be decoded
 // into a year, month, day, hour, minute, second, and subsecond
 type TimestampLogicalType struct {
diff --git a/go/parquet/schema/logical_types_test.go b/go/parquet/schema/logical_types_test.go
index 540899d79a02a..117157f95ef83 100644
--- a/go/parquet/schema/logical_types_test.go
+++ b/go/parquet/schema/logical_types_test.go
@@ -93,6 +93,7 @@ func TestConvertedTypeCompatibility(t *testing.T) {
 		{"time_micro", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimeMicros},
 		{"timestamp_milli", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.ConvertedTypes.TimestampMillis},
 		{"timestamp_micro", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimestampMicros},
+		{"timestamp_milli_opts", schema.NewTimestampLogicalTypeWithOpts(schema.WithTSIsAdjustedToUTC(), schema.WithTSTimeUnitType(schema.TimeUnitMillis)), schema.ConvertedTypes.TimestampMillis},
 		{"uint8", schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint8},
 		{"uint16", schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint16},
 		{"uint32", schema.NewIntLogicalType(32 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint32},
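A usage sketch of the new functional-options constructor; per the doc comment above it should be equivalent to `NewTimestampLogicalType(true, schema.TimeUnitMicros)`, with the unit defaulting to milliseconds whenever `WithTSTimeUnitType` is omitted:

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v13/parquet/schema"
)

func main() {
	// Each property is opted into explicitly instead of positionally.
	ts := schema.NewTimestampLogicalTypeWithOpts(
		schema.WithTSIsAdjustedToUTC(),
		schema.WithTSTimeUnitType(schema.TimeUnitMicros),
	)
	fmt.Println(ts)
}
```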
diff --git a/go/parquet/schema/reflection.go b/go/parquet/schema/reflection.go
index f1e204a171712..b85c1c28c781a 100644
--- a/go/parquet/schema/reflection.go
+++ b/go/parquet/schema/reflection.go
@@ -64,6 +64,8 @@ type taggedInfo struct {
 	LogicalType      LogicalType
 	KeyLogicalType   LogicalType
 	ValueLogicalType LogicalType
+
+	Exclude bool
 }

 func (t *taggedInfo) CopyForKey() (ret taggedInfo) {
@@ -186,6 +188,7 @@ func newTaggedInfo() taggedInfo {
 		LogicalType:      NoLogicalType{},
 		KeyLogicalType:   NoLogicalType{},
 		ValueLogicalType: NoLogicalType{},
+		Exclude:          false,
 	}
 }

@@ -232,6 +235,10 @@ func infoFromTags(f reflect.StructTag) *taggedInfo {
 	if ptags, ok := f.Lookup("parquet"); ok {
 		info := newTaggedInfo()
+		if ptags == "-" {
+			info.Exclude = true
+			return &info
+		}
 		for _, tag := range strings.Split(strings.Replace(ptags, "\t", "", -1), ",") {
 			tag = strings.TrimSpace(tag)
 			kv := strings.SplitN(tag, "=", 2)
@@ -370,8 +377,10 @@ func typeToNode(name string, typ reflect.Type, repType parquet.Repetition, info
 		fields := make(FieldList, 0)
 		for i := 0; i < typ.NumField(); i++ {
 			f := typ.Field(i)
-
-			fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, infoFromTags(f.Tag)))
+			tags := infoFromTags(f.Tag)
+			if tags == nil || !tags.Exclude {
+				fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, tags))
+			}
 		}
 		// group nodes don't have a physical type
 		if physical != parquet.Types.Undefined {
diff --git a/go/parquet/schema/reflection_test.go b/go/parquet/schema/reflection_test.go
index 7be1475513c52..4a029d058155a 100644
--- a/go/parquet/schema/reflection_test.go
+++ b/go/parquet/schema/reflection_test.go
@@ -309,7 +309,8 @@ func TestStructFromSchema(t *testing.T) {

 func TestStructFromSchemaWithNesting(t *testing.T) {
 	type Other struct {
-		List *[]*float32
+		List     *[]*float32
+		Excluded int32 `parquet:"-"`
 	}

 	type Nested struct {
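With `parquet:"-"` handled in `infoFromTags` and honored in `typeToNode`, struct fields can now be excluded when deriving a Parquet schema via reflection, mirroring `encoding/json` conventions. A sketch (the `Record` type is illustrative, not from the source):

```go
package main

import (
	"os"

	"github.com/apache/arrow/go/v13/parquet/schema"
)

type Record struct {
	Name    string
	Score   int32
	Scratch int64 `parquet:"-"` // skipped when deriving the schema
}

func main() {
	sc, err := schema.NewSchemaFromStruct(Record{})
	if err != nil {
		panic(err)
	}
	// Prints a schema containing only Name and Score.
	schema.PrintSchema(sc.Root(), os.Stdout, 4)
}
```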
diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc
index 871a2e95b94ec..5640bc4349670 100644
--- a/java/dataset/src/main/cpp/jni_wrapper.cc
+++ b/java/dataset/src/main/cpp/jni_wrapper.cc
@@ -27,6 +27,7 @@
 #include "arrow/dataset/file_base.h"
 #include "arrow/filesystem/localfs.h"
 #include "arrow/filesystem/path_util.h"
+#include "arrow/filesystem/s3fs.h"
 #include "arrow/engine/substrait/util.h"
 #include "arrow/ipc/api.h"
 #include "arrow/util/iterator.h"
@@ -569,6 +570,18 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_releaseBuffe
   JNI_METHOD_END()
 }

+/*
+ * Class:     org_apache_arrow_dataset_jni_JniWrapper
+ * Method:    ensureS3Finalized
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_ensureS3Finalized(
+    JNIEnv* env, jobject) {
+  JNI_METHOD_START
+  JniAssertOkOrThrow(arrow::fs::EnsureS3Finalized());
+  JNI_METHOD_END()
+}
+
 /*
  * Class:     org_apache_arrow_dataset_file_JniWrapper
  * Method:    makeFileSystemDatasetFactory
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java
index 7ada21c058280..a3b31c73e8540 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java
@@ -59,6 +59,7 @@ public void ensureLoaded() {
       return;
     }
     loadRemaining();
+    ensureS3FinalizedOnShutdown();
   }

   private synchronized void loadRemaining() {
@@ -109,4 +110,8 @@ private String getNormalizedArch() {
     }
     return arch;
   }
+
+  private void ensureS3FinalizedOnShutdown() {
+    Runtime.getRuntime().addShutdownHook(new Thread(() -> { JniWrapper.get().ensureS3Finalized(); }));
+  }
 }
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
index 1a9d4188c168f..93cc5d7a37040 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
@@ -108,4 +108,10 @@ private JniWrapper() {
    * @param bufferId the native pointer of the arrow::Buffer instance.
    */
   public native void releaseBuffer(long bufferId);
+
+  /**
+   * Ensure the S3 APIs are shut down, but only if not already done. If the S3 APIs are
+   * uninitialized, then this is a no-op.
+   */
+  public native void ensureS3Finalized();
 }
diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java
index 870114d7db1b5..06c6669cfd162 100644
--- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java
+++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java
@@ -161,7 +161,7 @@ public InnerAllocator() {
   }

   private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCapacity) {
-    PoolArenasCache cache = threadCache();
+    PoolThreadCache cache = threadCache();
     PoolArena<ByteBuffer> directArena = cache.directArena;

     if (directArena != null) {
diff --git a/java/pom.xml b/java/pom.xml
index ccb2a2b72d5e6..f6837cd82c7b5 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -33,7 +33,7 @@
     5.9.0
     1.7.25
     31.1-jre
-    4.1.94.Final
+    4.1.96.Final
     1.56.0
     3.23.1
     2.15.1
@@ -984,13 +984,13 @@
       java-dist
       false
-      ON
+      ON
+      ON
       ON
       ON
-      OFF
       OFF
       ON
-      OFF
+      ON
       ON
@@ -1012,16 +1012,18 @@
         -S cpp
         -B cpp-jni
         -DARROW_BUILD_SHARED=OFF
-        -DARROW_CSV=${ARROW_CSV}
-        -DARROW_DATASET=ON
+        -DARROW_CSV=${ARROW_DATASET}
+        -DARROW_DATASET=${ARROW_DATASET}
         -DARROW_DEPENDENCY_SOURCE=BUNDLED
         -DARROW_DEPENDENCY_USE_SHARED=OFF
         -DARROW_FILESYSTEM=ON
         -DARROW_GANDIVA=${ARROW_GANDIVA}
         -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON
+        -DARROW_JSON=${ARROW_DATASET}
         -DARROW_ORC=${ARROW_ORC}
         -DARROW_PARQUET=${ARROW_PARQUET}
         -DARROW_S3=ON
+        -DARROW_SUBSTRAIT=${ARROW_DATASET}
         -DARROW_USE_CCACHE=ON
         -DCMAKE_BUILD_TYPE=Release
         -DCMAKE_INSTALL_LIBDIR=lib/${os.detected.arch}
@@ -1068,6 +1070,8 @@
         -DCMAKE_INSTALL_LIBDIR=lib/${os.detected.arch}
         -DCMAKE_INSTALL_PREFIX=${arrow.dataset.jni.dist.dir}
         -DCMAKE_PREFIX_PATH=${project.basedir}/../java-dist/lib/${os.detected.arch}/cmake
+        -DProtobuf_USE_STATIC_LIBS=ON
+        -DProtobuf_ROOT=${project.basedir}/../cpp-jni/protobuf_ep-install
         ../
@@ -1097,13 +1101,14 @@
       java-dist
       false
-      ON
-      OFF
+      ON
+      OFF
+      ON
       ON
       OFF
       ON
       OFF
-      OFF
+      ON
@@ -1124,13 +1129,16 @@
         -S cpp
         -B cpp-jni
         -DARROW_BUILD_SHARED=OFF
-        -DARROW_CSV=${ARROW_CSV}
-        -DARROW_DATASET=ON
+        -DARROW_CSV=${ARROW_DATASET}
+        -DARROW_DATASET=${ARROW_DATASET}
         -DARROW_DEPENDENCY_USE_SHARED=OFF
         -DARROW_FILESYSTEM=ON
+        -DARROW_GANDIVA=${ARROW_GANDIVA}
+        -DARROW_JSON=${ARROW_DATASET}
         -DARROW_ORC=${ARROW_ORC}
         -DARROW_PARQUET=${ARROW_PARQUET}
         -DARROW_S3=ON
+        -DARROW_SUBSTRAIT=${ARROW_DATASET}
         -DARROW_USE_CCACHE=ON
         -DARROW_WITH_BROTLI=ON
         -DARROW_WITH_LZ4=ON
diff --git a/js/.gitignore b/js/.gitignore
index 5b8e0dcc7eba2..5752f9249f034 100644
--- a/js/.gitignore
+++ b/js/.gitignore
@@ -84,5 +84,8 @@ test/bundle/**/*-bundle.js*
 # jest snapshots (too big)
 test/__snapshots__/

+# jest cache
+.jest-cache/
+
 # VSCode
 !.vscode
diff --git a/js/jest.config.js b/js/jest.config.js
index 8aaf60a0ff4a2..5a004ef8e928e 100644
--- a/js/jest.config.js
+++ b/js/jest.config.js
@@ -22,6 +22,7 @@ export default {
   roots: [
     "<rootDir>/test/",
   ],
+  cacheDirectory: ".jest-cache",
   extensionsToTreatAsEsm: [".ts"],
   moduleFileExtensions: ["js", "mjs", "ts"],
   coverageReporters: ["lcov", "json",],
diff --git a/js/src/builder.ts b/js/src/builder.ts
index 6f84154935f7b..90fe3ddcc9477 100644
--- a/js/src/builder.ts
+++ b/js/src/builder.ts
@@ -72,7 +72,7 @@ export
interface BuilderOptions {
 *
 * @example
 * ```ts
- * import { Builder, Utf8 } from 'apache-arrow';
+ * import { makeBuilder, Utf8 } from 'apache-arrow';
 *
 * const utf8Builder = makeBuilder({
 *     type: new Utf8(),
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
index 35dc496bddb00..8520cf1f21fdd 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
@@ -20,16 +20,21 @@
 #include "arrow/matlab/array/proxy/array.h"
 #include "arrow/matlab/bit/unpack.h"
 #include "arrow/matlab/error/error.h"
+#include "arrow/matlab/type/proxy/wrap.h"
+#include "arrow/type_traits.h"
+
+#include "libmexclass/proxy/ProxyManager.h"

 namespace arrow::matlab::array::proxy {

-    Array::Array() {
+    Array::Array(std::shared_ptr<arrow::Array> array) : array{std::move(array)} {
         // Register Proxy methods.
         REGISTER_METHOD(Array, toString);
         REGISTER_METHOD(Array, toMATLAB);
         REGISTER_METHOD(Array, length);
         REGISTER_METHOD(Array, valid);
+        REGISTER_METHOD(Array, type);
     }

     std::shared_ptr<arrow::Array> Array::getArray() {
@@ -69,4 +74,21 @@ namespace arrow::matlab::array::proxy {
         auto valid_elements_mda = bit::unpack(validity_bitmap, array_length);
         context.outputs[0] = valid_elements_mda;
     }
+
+    void Array::type(libmexclass::proxy::method::Context& context) {
+        namespace mda = ::matlab::data;
+
+        mda::ArrayFactory factory;
+
+        MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto type_proxy,
+                                            type::proxy::wrap(array->type()),
+                                            context,
+                                            error::ARRAY_FAILED_TO_CREATE_TYPE_PROXY);
+
+        auto type_id = type_proxy->unwrap()->id();
+        auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(type_proxy);
+
+        context.outputs[0] = factory.createScalar(proxy_id);
+        context.outputs[1] = factory.createScalar(static_cast<int32_t>(type_id));
+    }
 }
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.h b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
index 94fad759759ca..90199767258e2 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/array.h
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
@@ -18,6 +18,7 @@
 #pragma once

 #include "arrow/array.h"
+#include "arrow/matlab/type/proxy/type.h"

 #include "libmexclass/proxy/Proxy.h"

@@ -25,7 +26,7 @@ namespace arrow::matlab::array::proxy {

 class Array : public libmexclass::proxy::Proxy {
     public:
-        Array();
+        Array(std::shared_ptr<arrow::Array> array);

         virtual ~Array() {}

@@ -39,6 +40,8 @@ class Array : public libmexclass::proxy::Proxy {

         void valid(libmexclass::proxy::method::Context& context);

+        void type(libmexclass::proxy::method::Context& context);
+
         virtual void toMATLAB(libmexclass::proxy::method::Context& context) = 0;

         std::shared_ptr<arrow::Array> array;
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc
index 9a3b7ed4e22e9..5be0cfb5a3d13 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc
@@ -16,6 +16,7 @@
 // under the License.
#include "arrow/matlab/array/proxy/boolean_array.h" +#include "arrow/matlab/type/proxy/primitive_ctype.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/bit/pack.h" @@ -23,6 +24,9 @@ namespace arrow::matlab::array::proxy { + BooleanArray::BooleanArray(std::shared_ptr array) + : arrow::matlab::array::proxy::Array{std::move(array)} {} + libmexclass::proxy::MakeResult BooleanArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { ::matlab::data::StructArray opts = constructor_arguments[0]; @@ -40,7 +44,8 @@ namespace arrow::matlab::array::proxy { const auto array_length = logical_mda.getNumberOfElements(); auto array_data = arrow::ArrayData::Make(data_type, array_length, {validity_bitmap_buffer, data_buffer}); - return std::make_shared(arrow::MakeArray(array_data)); + auto arrow_array = std::static_pointer_cast(arrow::MakeArray(array_data)); + return std::make_shared(std::move(arrow_array)); } void BooleanArray::toMATLAB(libmexclass::proxy::method::Context& context) { @@ -49,5 +54,4 @@ namespace arrow::matlab::array::proxy { auto logical_array_mda = bit::unpack(packed_logical_data_buffer, array_length); context.outputs[0] = logical_array_mda; } - } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h index 6966d1090ee56..775673c29eada 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h @@ -20,21 +20,18 @@ #include "arrow/matlab/array/proxy/array.h" #include "libmexclass/proxy/Proxy.h" +#include "arrow/type_fwd.h" namespace arrow::matlab::array::proxy { class BooleanArray : public arrow::matlab::array::proxy::Array { public: - BooleanArray(const std::shared_ptr logical_array) - : arrow::matlab::array::proxy::Array() { - array = logical_array; - } + BooleanArray(std::shared_ptr array); static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); protected: void toMATLAB(libmexclass::proxy::method::Context& context) override; - }; } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h index 43e7aec622d55..6893079c78b95 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h @@ -21,76 +21,61 @@ #include "arrow/array/data.h" #include "arrow/array/util.h" -#include "arrow/builder.h" #include "arrow/type_traits.h" #include "arrow/matlab/array/proxy/array.h" +#include "arrow/matlab/type/proxy/traits.h" + #include "arrow/matlab/error/error.h" #include "arrow/matlab/bit/pack.h" #include "arrow/matlab/bit/unpack.h" +#include "arrow/matlab/buffer/matlab_buffer.h" #include "libmexclass/proxy/Proxy.h" +#include "arrow/matlab/type/time_unit.h" +#include "arrow/util/utf8.h" + namespace arrow::matlab::array::proxy { -template +template class NumericArray : public arrow::matlab::array::proxy::Array { public: - NumericArray(const std::shared_ptr numeric_array) - : arrow::matlab::array::proxy::Array() { - array = numeric_array; - } - static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { - using ArrowType = typename arrow::CTypeTraits::ArrowType; - using BuilderType = typename arrow::CTypeTraits::BuilderType; + NumericArray(const std::shared_ptr> numeric_array) + : arrow::matlab::array::proxy::Array{std::move(numeric_array)} {} + + static libmexclass::proxy::MakeResult 
make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+            using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer;
+            using CType = typename arrow::TypeTraits<ArrowType>::CType;
+            using NumericArray = arrow::NumericArray<ArrowType>;
+            using NumericArrayProxy = typename proxy::NumericArray<ArrowType>;

             ::matlab::data::StructArray opts = constructor_arguments[0];

             // Get the mxArray from constructor arguments
             const ::matlab::data::TypedArray<CType> numeric_mda = opts[0]["MatlabArray"];
             const ::matlab::data::TypedArray<bool> valid_mda = opts[0]["Valid"];
-            const ::matlab::data::TypedArray<bool> make_copy = opts[0]["DeepCopy"];

-            // Get raw pointer of mxArray
-            auto it(numeric_mda.cbegin());
-            auto dt = it.operator->();
-
-            const auto make_deep_copy = make_copy[0];
-
-            if (make_deep_copy) {
-                // Get the unpacked validity bitmap (if it exists)
-                auto unpacked_validity_bitmap = bit::extract_ptr(valid_mda);
+            auto data_buffer = std::make_shared<MatlabBuffer>(numeric_mda);

-                BuilderType builder;
+            const auto data_type = arrow::CTypeTraits<CType>::type_singleton();
+            const auto length = static_cast<int64_t>(numeric_mda.getNumberOfElements()); // cast size_t to int64_t

-                auto status = builder.AppendValues(dt, numeric_mda.getNumberOfElements(), unpacked_validity_bitmap);
-                MATLAB_ERROR_IF_NOT_OK(status, error::APPEND_VALUES_ERROR_ID);
-
-                MATLAB_ASSIGN_OR_ERROR(auto array, builder.Finish(), error::BUILD_ARRAY_ERROR_ID);
-
-                return std::make_shared<NumericArray<CType>>(array);
-
-            } else {
-                const auto data_type = arrow::CTypeTraits<CType>::type_singleton();
-                const auto length = static_cast<int64_t>(numeric_mda.getNumberOfElements()); // cast size_t to int64_t
-
-                // Do not make a copy when creating arrow::Buffer
-                auto data_buffer = std::make_shared<arrow::Buffer>(reinterpret_cast<const uint8_t*>(dt),
-                                                                   sizeof(CType) * numeric_mda.getNumberOfElements());
-                // Pack the validity bitmap values.
-                MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap, bit::packValid(valid_mda), error::BITPACK_VALIDITY_BITMAP_ERROR_ID);
-                auto array_data = arrow::ArrayData::Make(data_type, length, {packed_validity_bitmap, data_buffer});
-                return std::make_shared<NumericArray<CType>>(arrow::MakeArray(array_data));
-            }
+            // Pack the validity bitmap values.
+            MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap, bit::packValid(valid_mda), error::BITPACK_VALIDITY_BITMAP_ERROR_ID);
+            auto array_data = arrow::ArrayData::Make(data_type, length, {packed_validity_bitmap, data_buffer});
+            auto numeric_array = std::static_pointer_cast<NumericArray>(arrow::MakeArray(array_data));
+            return std::make_shared<NumericArrayProxy>(std::move(numeric_array));
         }

     protected:
         void toMATLAB(libmexclass::proxy::method::Context& context) override {
-            using ArrowArrayType = typename arrow::CTypeTraits<CType>::ArrayType;
+            using CType = typename arrow::TypeTraits<ArrowType>::CType;
+            using NumericArray = arrow::NumericArray<ArrowType>;

             const auto num_elements = static_cast<size_t>(array->length());
-            const auto numeric_array = std::static_pointer_cast<ArrowArrayType>(array);
+            const auto numeric_array = std::static_pointer_cast<NumericArray>(array);
             const CType* const data_begin = numeric_array->raw_values();
             const CType* const data_end = data_begin + num_elements;
@@ -102,4 +87,49 @@ class NumericArray : public arrow::matlab::array::proxy::Array {
     }
 };

+    // Specialization of NumericArray::Make for arrow::TimestampType.
+    template <>
+    libmexclass::proxy::MakeResult NumericArray<arrow::TimestampType>::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+        namespace mda = ::matlab::data;
+        using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer;
+        using TimestampArray = arrow::TimestampArray;
+        using TimestampArrayProxy = arrow::matlab::array::proxy::NumericArray<arrow::TimestampType>;
+
+        mda::StructArray opts = constructor_arguments[0];
+
+        // Get the mxArray from constructor arguments
+        const mda::TypedArray<int64_t> timestamp_mda = opts[0]["MatlabArray"];
+        const mda::TypedArray<bool> validity_bitmap_mda = opts[0]["Valid"];
+
+        const mda::TypedArray<mda::MATLABString> timezone_mda = opts[0]["TimeZone"];
+        const mda::TypedArray<mda::MATLABString> units_mda = opts[0]["TimeUnit"];
+
+        // extract the time zone string
+        const std::u16string& u16_timezone = timezone_mda[0];
+        MATLAB_ASSIGN_OR_ERROR(const auto timezone,
+                               arrow::util::UTF16StringToUTF8(u16_timezone),
+                               error::UNICODE_CONVERSION_ERROR_ID);
+
+        // extract the time unit
+        const std::u16string& u16_timeunit = units_mda[0];
+        MATLAB_ASSIGN_OR_ERROR(const auto time_unit,
+                               arrow::matlab::type::timeUnitFromString(u16_timeunit),
+                               error::UKNOWN_TIME_UNIT_ERROR_ID)
+
+        // create the timestamp_type
+        auto data_type = arrow::timestamp(time_unit, timezone);
+        auto array_length = static_cast<int64_t>(timestamp_mda.getNumberOfElements()); // cast size_t to int64_t
+
+        auto data_buffer = std::make_shared<MatlabBuffer>(timestamp_mda);
+
+        // Pack the validity bitmap values.
+        MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap,
+                               bit::packValid(validity_bitmap_mda),
+                               error::BITPACK_VALIDITY_BITMAP_ERROR_ID);
+
+        auto array_data = arrow::ArrayData::Make(data_type, array_length, {packed_validity_bitmap, data_buffer});
+        auto timestamp_array = std::static_pointer_cast<TimestampArray>(arrow::MakeArray(array_data));
+        return std::make_shared<TimestampArrayProxy>(std::move(timestamp_array));
+    }
+
 }
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc
index 51f39d72fca6c..c583e8851a3ac 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc
@@ -16,6 +16,7 @@
 // under the License.
#include "arrow/matlab/array/proxy/string_array.h" +#include "arrow/matlab/type/proxy/string_type.h" #include "arrow/array/builder_binary.h" @@ -26,6 +27,9 @@ namespace arrow::matlab::array::proxy { + StringArray::StringArray(const std::shared_ptr string_array) + : arrow::matlab::array::proxy::Array(std::move(string_array)) {} + libmexclass::proxy::MakeResult StringArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { namespace mda = ::matlab::data; @@ -53,8 +57,8 @@ namespace arrow::matlab::array::proxy { arrow::StringBuilder builder; MATLAB_ERROR_IF_NOT_OK(builder.AppendValues(strings, unpacked_validity_bitmap_ptr), error::STRING_BUILDER_APPEND_FAILED); MATLAB_ASSIGN_OR_ERROR(auto array, builder.Finish(), error::STRING_BUILDER_FINISH_FAILED); - - return std::make_shared(array); + auto typed_array = std::static_pointer_cast(array); + return std::make_shared(std::move(typed_array)); } void StringArray::toMATLAB(libmexclass::proxy::method::Context& context) { @@ -77,5 +81,4 @@ namespace arrow::matlab::array::proxy { auto array_mda = factory.createArray({array_length, 1}, strings.begin(), strings.end()); context.outputs[0] = array_mda; } - } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h index de0c4625928e4..bdcfedd7cdda3 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h @@ -21,15 +21,14 @@ #include "libmexclass/proxy/Proxy.h" +#include "arrow/type_fwd.h" + namespace arrow::matlab::array::proxy { class StringArray : public arrow::matlab::array::proxy::Array { public: - StringArray(const std::shared_ptr string_array) - : arrow::matlab::array::proxy::Array() { - array = string_array; - } - + StringArray(const std::shared_ptr string_array); + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); protected: diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc deleted file mode 100644 index aa79a4f99240e..0000000000000 --- a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/matlab/array/proxy/timestamp_array.h" - -#include "arrow/matlab/error/error.h" -#include "arrow/matlab/bit/pack.h" -#include "arrow/matlab/bit/unpack.h" - -#include "arrow/matlab/type/time_unit.h" -#include "arrow/util/utf8.h" -#include "arrow/type.h" -#include "arrow/builder.h" - - -namespace arrow::matlab::array::proxy { - - namespace { - const uint8_t* getUnpackedValidityBitmap(const ::matlab::data::TypedArray& valid_elements) { - const auto valid_elements_iterator(valid_elements.cbegin()); - return reinterpret_cast(valid_elements_iterator.operator->()); - } - } // anonymous namespace - - libmexclass::proxy::MakeResult TimestampArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { - namespace mda = ::matlab::data; - - mda::StructArray opts = constructor_arguments[0]; - - // Get the mxArray from constructor arguments - const mda::TypedArray timestamp_mda = opts[0]["MatlabArray"]; - const mda::TypedArray validity_bitmap_mda = opts[0]["Valid"]; - - const mda::TypedArray timezone_mda = opts[0]["TimeZone"]; - const mda::TypedArray units_mda = opts[0]["TimeUnit"]; - - // extract the time zone string - const std::u16string& u16_timezone = timezone_mda[0]; - MATLAB_ASSIGN_OR_ERROR(const auto timezone, arrow::util::UTF16StringToUTF8(u16_timezone), - error::UNICODE_CONVERSION_ERROR_ID); - - // extract the time unit - MATLAB_ASSIGN_OR_ERROR(const auto time_unit, arrow::matlab::type::timeUnitFromString(units_mda[0]), - error::UKNOWN_TIME_UNIT_ERROR_ID) - - // create the timestamp_type - auto data_type = arrow::timestamp(time_unit, timezone); - arrow::TimestampBuilder builder(data_type, arrow::default_memory_pool()); - - // Get raw pointer of mxArray - auto it(timestamp_mda.cbegin()); - auto dt = it.operator->(); - - // Pack the validity bitmap values. - const uint8_t* valid_mask = getUnpackedValidityBitmap(validity_bitmap_mda); - const auto num_elements = timestamp_mda.getNumberOfElements(); - - // Append values - MATLAB_ERROR_IF_NOT_OK(builder.AppendValues(dt, num_elements, valid_mask), error::APPEND_VALUES_ERROR_ID); - MATLAB_ASSIGN_OR_ERROR(auto timestamp_array, builder.Finish(), error::BUILD_ARRAY_ERROR_ID); - - return std::make_shared(timestamp_array); - } - - void TimestampArray::toMATLAB(libmexclass::proxy::method::Context& context) { - namespace mda = ::matlab::data; - - const auto num_elements = static_cast(array->length()); - const auto timestamp_array = std::static_pointer_cast(array); - const int64_t* const data_begin = timestamp_array->raw_values(); - const int64_t* const data_end = data_begin + num_elements; - - mda::ArrayFactory factory; - - // Constructs a TypedArray from the raw values. Makes a copy. - mda::TypedArray result = factory.createArray({num_elements, 1}, data_begin, data_end); - context.outputs[0] = result; - } -} diff --git a/matlab/src/cpp/arrow/matlab/buffer/matlab_buffer.h b/matlab/src/cpp/arrow/matlab/buffer/matlab_buffer.h new file mode 100644 index 0000000000000..80b237544ded8 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/buffer/matlab_buffer.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/buffer.h"
+
+#include "MatlabDataArray.hpp"
+
+namespace arrow::matlab::buffer {
+
+    namespace mda = ::matlab::data;
+
+    class MatlabBuffer : public arrow::Buffer {
+        public:
+
+            template <typename CType>
+            MatlabBuffer(const mda::TypedArray<CType> typed_array)
+                : arrow::Buffer{nullptr, 0}
+                , array{typed_array} {
+
+                // Get raw pointer of mxArray
+                auto it(typed_array.cbegin());
+                auto dt = it.operator->();
+
+                data_ = reinterpret_cast<const uint8_t*>(dt);
+                size_ = sizeof(CType) * static_cast<int64_t>(typed_array.getNumberOfElements());
+                capacity_ = size_;
+                is_mutable_ = false;
+            }
+        private:
+            const mda::Array array;
+    };
+}
\ No newline at end of file
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h
index b1b7b75b8c84a..3d134d169e7af 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -171,4 +171,6 @@ namespace arrow::matlab::error {
     static const char* STRING_BUILDER_APPEND_FAILED = "arrow:matlab:array:string:StringBuilderAppendFailed";
     static const char* STRING_BUILDER_FINISH_FAILED = "arrow:matlab:array:string:StringBuilderFinishFailed";
     static const char* UKNOWN_TIME_UNIT_ERROR_ID = "arrow:matlab:UnknownTimeUnit";
+    static const char* FIELD_FAILED_TO_CREATE_TYPE_PROXY = "arrow:field:FailedToCreateTypeProxy";
+    static const char* ARRAY_FAILED_TO_CREATE_TYPE_PROXY = "arrow:array:FailedToCreateTypeProxy";
 }
diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
index 41f1357bcedc5..ac9a595a45852 100644
--- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
+++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
@@ -18,29 +18,47 @@
 #include "arrow/matlab/array/proxy/boolean_array.h"
 #include "arrow/matlab/array/proxy/numeric_array.h"
 #include "arrow/matlab/array/proxy/string_array.h"
-#include "arrow/matlab/array/proxy/timestamp_array.h"
 #include "arrow/matlab/tabular/proxy/record_batch.h"
 #include "arrow/matlab/error/error.h"
+#include "arrow/matlab/type/proxy/primitive_ctype.h"
+#include "arrow/matlab/type/proxy/string_type.h"
+#include "arrow/matlab/type/proxy/timestamp_type.h"
+#include "arrow/matlab/type/proxy/field.h"

 #include "factory.h"

 namespace arrow::matlab::proxy {

 libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, const FunctionArguments& constructor_arguments) {
-    REGISTER_PROXY(arrow.array.proxy.Float32Array , arrow::matlab::array::proxy::NumericArray<float>);
-    REGISTER_PROXY(arrow.array.proxy.Float64Array , arrow::matlab::array::proxy::NumericArray<double>);
-    REGISTER_PROXY(arrow.array.proxy.UInt8Array   , arrow::matlab::array::proxy::NumericArray<uint8_t>);
-    REGISTER_PROXY(arrow.array.proxy.UInt16Array  , arrow::matlab::array::proxy::NumericArray<uint16_t>);
-    REGISTER_PROXY(arrow.array.proxy.UInt32Array  , arrow::matlab::array::proxy::NumericArray<uint32_t>);
-    REGISTER_PROXY(arrow.array.proxy.UInt64Array  , arrow::matlab::array::proxy::NumericArray<uint64_t>);
-    REGISTER_PROXY(arrow.array.proxy.Int8Array    , arrow::matlab::array::proxy::NumericArray<int8_t>);
-    REGISTER_PROXY(arrow.array.proxy.Int16Array   , arrow::matlab::array::proxy::NumericArray<int16_t>);
-    REGISTER_PROXY(arrow.array.proxy.Int32Array   , arrow::matlab::array::proxy::NumericArray<int32_t>);
-    REGISTER_PROXY(arrow.array.proxy.Int64Array   , arrow::matlab::array::proxy::NumericArray<int64_t>);
+    REGISTER_PROXY(arrow.array.proxy.Float32Array , arrow::matlab::array::proxy::NumericArray<arrow::FloatType>);
+    REGISTER_PROXY(arrow.array.proxy.Float64Array , arrow::matlab::array::proxy::NumericArray<arrow::DoubleType>);
+    REGISTER_PROXY(arrow.array.proxy.UInt8Array   , arrow::matlab::array::proxy::NumericArray<arrow::UInt8Type>);
+    REGISTER_PROXY(arrow.array.proxy.UInt16Array  , arrow::matlab::array::proxy::NumericArray<arrow::UInt16Type>);
+    REGISTER_PROXY(arrow.array.proxy.UInt32Array  , arrow::matlab::array::proxy::NumericArray<arrow::UInt32Type>);
+    REGISTER_PROXY(arrow.array.proxy.UInt64Array  , arrow::matlab::array::proxy::NumericArray<arrow::UInt64Type>);
+    REGISTER_PROXY(arrow.array.proxy.Int8Array    , arrow::matlab::array::proxy::NumericArray<arrow::Int8Type>);
+    REGISTER_PROXY(arrow.array.proxy.Int16Array   , arrow::matlab::array::proxy::NumericArray<arrow::Int16Type>);
+    REGISTER_PROXY(arrow.array.proxy.Int32Array   , arrow::matlab::array::proxy::NumericArray<arrow::Int32Type>);
+    REGISTER_PROXY(arrow.array.proxy.Int64Array   , arrow::matlab::array::proxy::NumericArray<arrow::Int64Type>);
     REGISTER_PROXY(arrow.array.proxy.BooleanArray , arrow::matlab::array::proxy::BooleanArray);
     REGISTER_PROXY(arrow.array.proxy.StringArray  , arrow::matlab::array::proxy::StringArray);
-    REGISTER_PROXY(arrow.array.proxy.TimestampArray, arrow::matlab::array::proxy::TimestampArray);
+    REGISTER_PROXY(arrow.array.proxy.TimestampArray, arrow::matlab::array::proxy::NumericArray<arrow::TimestampType>);
     REGISTER_PROXY(arrow.tabular.proxy.RecordBatch , arrow::matlab::tabular::proxy::RecordBatch);
+    REGISTER_PROXY(arrow.type.proxy.Field          , arrow::matlab::type::proxy::Field);
+    REGISTER_PROXY(arrow.type.proxy.Float32Type    , arrow::matlab::type::proxy::PrimitiveCType<float>);
+    REGISTER_PROXY(arrow.type.proxy.Float64Type    , arrow::matlab::type::proxy::PrimitiveCType<double>);
+    REGISTER_PROXY(arrow.type.proxy.UInt8Type      , arrow::matlab::type::proxy::PrimitiveCType<uint8_t>);
+    REGISTER_PROXY(arrow.type.proxy.UInt16Type     , arrow::matlab::type::proxy::PrimitiveCType<uint16_t>);
+    REGISTER_PROXY(arrow.type.proxy.UInt32Type     , arrow::matlab::type::proxy::PrimitiveCType<uint32_t>);
+    REGISTER_PROXY(arrow.type.proxy.UInt64Type     , arrow::matlab::type::proxy::PrimitiveCType<uint64_t>);
+    REGISTER_PROXY(arrow.type.proxy.Int8Type       , arrow::matlab::type::proxy::PrimitiveCType<int8_t>);
+    REGISTER_PROXY(arrow.type.proxy.Int16Type      , arrow::matlab::type::proxy::PrimitiveCType<int16_t>);
+    REGISTER_PROXY(arrow.type.proxy.Int32Type      , arrow::matlab::type::proxy::PrimitiveCType<int32_t>);
+    REGISTER_PROXY(arrow.type.proxy.Int64Type      , arrow::matlab::type::proxy::PrimitiveCType<int64_t>);
+    REGISTER_PROXY(arrow.type.proxy.BooleanType    , arrow::matlab::type::proxy::PrimitiveCType<bool>);
+    REGISTER_PROXY(arrow.type.proxy.StringType     , arrow::matlab::type::proxy::StringType);
+    REGISTER_PROXY(arrow.type.proxy.TimestampType  , arrow::matlab::type::proxy::TimestampType);
+
     return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name};
 };
diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/field.cc b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc
new file mode 100644
index 0000000000000..0cf7c995fb275
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/utf8.h" + +#include "arrow/matlab/type/proxy/field.h" +#include "arrow/matlab/error/error.h" + +#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/string_type.h" +#include "arrow/matlab/type/proxy/wrap.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::type::proxy { + + Field::Field(std::shared_ptr field) : field{std::move(field)} { + REGISTER_METHOD(Field, name); + REGISTER_METHOD(Field, type); + REGISTER_METHOD(Field, toString); + } + + std::shared_ptr Field::unwrap() { + return field; + } + + void Field::name(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const auto& str_utf8 = field->name(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + auto str_mda = factory.createScalar(str_utf16); + context.outputs[0] = str_mda; + } + + void Field::type(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + + const auto& datatype = field->type(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto proxy, type::proxy::wrap(datatype), context, error::FIELD_FAILED_TO_CREATE_TYPE_PROXY); + const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(proxy); + + mda::ArrayFactory factory; + context.outputs[0] = factory.createScalar(proxy_id); + context.outputs[1] = factory.createScalar(static_cast(datatype->id())); + } + + void Field::toString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const auto str_utf8 = field->ToString(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + auto str_mda = factory.createScalar(str_utf16); + context.outputs[0] = str_mda; + } + + libmexclass::proxy::MakeResult Field::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using FieldProxy = arrow::matlab::type::proxy::Field; + + mda::StructArray opts = constructor_arguments[0]; + const mda::StringArray name_mda = opts[0]["Name"]; + const mda::TypedArray type_proxy_id_mda = opts[0]["TypeProxyID"]; + + const std::u16string& name_utf16 = name_mda[0]; + MATLAB_ASSIGN_OR_ERROR(const auto name, + arrow::util::UTF16StringToUTF8(name_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + auto proxy = std::static_pointer_cast(libmexclass::proxy::ProxyManager::getProxy(type_proxy_id_mda[0])); + auto type = proxy->unwrap(); + auto field = arrow::field(name, type); + return std::make_shared(std::move(field)); + } + +} + diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h b/matlab/src/cpp/arrow/matlab/type/proxy/field.h similarity index 67% rename from 
matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h
rename to matlab/src/cpp/arrow/matlab/type/proxy/field.h
index ec67245564beb..8df73aa8af3a2 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h
+++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.h
@@ -17,26 +17,28 @@

 #pragma once

-#include "arrow/array.h"
-
-#include "arrow/matlab/array/proxy/array.h"
+#include "arrow/type.h"

 #include "libmexclass/proxy/Proxy.h"

-namespace arrow::matlab::array::proxy {
+namespace arrow::matlab::type::proxy {

-class TimestampArray : public arrow::matlab::array::proxy::Array {
+class Field : public libmexclass::proxy::Proxy {
     public:
-        TimestampArray(const std::shared_ptr<arrow::TimestampArray> timestamp_array)
-            : arrow::matlab::array::proxy::Array() {
-                array = timestamp_array;
-            }
+        Field(std::shared_ptr<arrow::Field> field);
+
+        virtual ~Field() {}
+
+        std::shared_ptr<arrow::Field> unwrap();

         static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

     protected:
+        void name(libmexclass::proxy::method::Context& context);
+        void type(libmexclass::proxy::method::Context& context);
+        void toString(libmexclass::proxy::method::Context& context);

-        void toMATLAB(libmexclass::proxy::method::Context& context) override;
+        std::shared_ptr<arrow::Field> field;
 };
 }
diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/fixed_width_type.cc b/matlab/src/cpp/arrow/matlab/type/proxy/fixed_width_type.cc
new file mode 100644
index 0000000000000..9ede57f2ee1dd
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/type/proxy/fixed_width_type.cc
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+#include "arrow/matlab/type/proxy/fixed_width_type.h"
+
+namespace arrow::matlab::type::proxy {
+
+    FixedWidthType::FixedWidthType(std::shared_ptr<arrow::FixedWidthType> type) : Type(std::move(type)) {
+        REGISTER_METHOD(FixedWidthType, bitWidth);
+    }
+
+    void FixedWidthType::bitWidth(libmexclass::proxy::method::Context& context) {
+        namespace mda = ::matlab::data;
+        mda::ArrayFactory factory;
+
+        auto bit_width_mda = factory.createScalar(data_type->bit_width());
+        context.outputs[0] = bit_width_mda;
+    }
+}
diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/fixed_width_type.h b/matlab/src/cpp/arrow/matlab/type/proxy/fixed_width_type.h
new file mode 100644
index 0000000000000..e245acd55640e
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/type/proxy/fixed_width_type.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "arrow/matlab/type/proxy/type.h"
+
+namespace arrow::matlab::type::proxy {
+
+class FixedWidthType : public arrow::matlab::type::proxy::Type {
+    public:
+        FixedWidthType(std::shared_ptr<arrow::FixedWidthType> type);
+
+        virtual ~FixedWidthType() {}
+
+    protected:
+        void bitWidth(libmexclass::proxy::method::Context& context);
+
+};
+
+}
diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/primitive_ctype.h b/matlab/src/cpp/arrow/matlab/type/proxy/primitive_ctype.h
new file mode 100644
index 0000000000000..0415972b44c5b
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/type/proxy/primitive_ctype.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/matlab/type/proxy/fixed_width_type.h"
+#include "arrow/type_traits.h"
+
+#include <type_traits>
+
+
+namespace arrow::matlab::type::proxy {
+
+template <typename CType>
+using arrow_type_t = typename arrow::CTypeTraits<CType>::ArrowType;
+
+template <typename CType>
+using is_primitive = arrow::is_primitive_ctype<arrow_type_t<CType>>;
+
+template <typename CType>
+using enable_if_primitive = std::enable_if_t<is_primitive<CType>::value, bool>;
+
+template <typename CType, enable_if_primitive<CType> = true>
+class PrimitiveCType : public arrow::matlab::type::proxy::FixedWidthType {
+
+    using ArrowDataType = arrow_type_t<CType>;
+
+    public:
+        PrimitiveCType(std::shared_ptr<ArrowDataType> primitive_type) : arrow::matlab::type::proxy::FixedWidthType(std::move(primitive_type)) {
+        }
+
+        ~PrimitiveCType() {}
+
+        static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+            auto data_type = arrow::CTypeTraits<CType>::type_singleton();
+            return std::make_shared<PrimitiveCType>(std::static_pointer_cast<ArrowDataType>(std::move(data_type)));
+        }
+};
+
+}
+
diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/string_type.cc b/matlab/src/cpp/arrow/matlab/type/proxy/string_type.cc
new file mode 100644
index 0000000000000..362dfba7344ea
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/type/proxy/string_type.cc
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/matlab/type/proxy/string_type.h" + +namespace arrow::matlab::type::proxy { + + StringType::StringType(std::shared_ptr string_type) : Type(std::move(string_type)) {} + + libmexclass::proxy::MakeResult StringType::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + auto string_type = std::static_pointer_cast(arrow::utf8()); + return std::make_shared(std::move(string_type)); + } +} diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/string_type.h b/matlab/src/cpp/arrow/matlab/type/proxy/string_type.h new file mode 100644 index 0000000000000..fd1808d9b8058 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/string_type.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/matlab/type/proxy/type.h" + +namespace arrow::matlab::type::proxy { + +class StringType : public arrow::matlab::type::proxy::Type { + + public: + StringType(std::shared_ptr string_type); + + ~StringType() {} + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); +}; + +} + diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc b/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc new file mode 100644 index 0000000000000..b1d35ee4874db --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/time_unit.h" +#include "arrow/matlab/error/error.h" +#include "arrow/util/utf8.h" + +namespace arrow::matlab::type::proxy { + + TimestampType::TimestampType(std::shared_ptr timestamp_type) : FixedWidthType(std::move(timestamp_type)) { + REGISTER_METHOD(TimestampType, timeUnit); + REGISTER_METHOD(TimestampType, timeZone); + } + + libmexclass::proxy::MakeResult TimestampType::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + + using TimestampTypeProxy = arrow::matlab::type::proxy::TimestampType; + + mda::StructArray opts = constructor_arguments[0]; + + // Get the mxArray from constructor arguments + const mda::StringArray timezone_mda = opts[0]["TimeZone"]; + const mda::StringArray timeunit_mda = opts[0]["TimeUnit"]; + + // extract the time zone + const std::u16string& utf16_timezone = timezone_mda[0]; + MATLAB_ASSIGN_OR_ERROR(const auto timezone, + arrow::util::UTF16StringToUTF8(utf16_timezone), + error::UNICODE_CONVERSION_ERROR_ID); + + // extract the time unit + const std::u16string& utf16_timeunit = timeunit_mda[0]; + MATLAB_ASSIGN_OR_ERROR(const auto timeunit, + arrow::matlab::type::timeUnitFromString(utf16_timeunit), + error::UKNOWN_TIME_UNIT_ERROR_ID); + + auto type = arrow::timestamp(timeunit, timezone); + auto time_type = std::static_pointer_cast(type); + return std::make_shared(std::move(time_type)); + } + + void TimestampType::timeZone(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + auto timestamp_type = std::static_pointer_cast(data_type); + const auto timezone_utf8 = timestamp_type->timezone(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto timezone_utf16, + arrow::util::UTF8StringToUTF16(timezone_utf8), + context, error::UNICODE_CONVERSION_ERROR_ID); + auto timezone_mda = factory.createScalar(timezone_utf16); + context.outputs[0] = timezone_mda; + } + + void TimestampType::timeUnit(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + auto timestamp_type = std::static_pointer_cast(data_type); + const auto timeunit = timestamp_type->unit(); + auto timeunit_mda = factory.createScalar(static_cast(timeunit)); + context.outputs[0] = timeunit_mda; + } +} diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.h b/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.h new file mode 100644 index 0000000000000..71005dc3a980d --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/matlab/type/proxy/fixed_width_type.h" +#include "arrow/type_traits.h" + +namespace arrow::matlab::type::proxy { + +class TimestampType : public arrow::matlab::type::proxy::FixedWidthType { + + public: + TimestampType(std::shared_ptr<arrow::TimestampType> timestamp_type); + + ~TimestampType() {} + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + + void timeZone(libmexclass::proxy::method::Context& context); + + void timeUnit(libmexclass::proxy::method::Context& context); +}; + +} + diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/traits.h b/matlab/src/cpp/arrow/matlab/type/proxy/traits.h new file mode 100644 index 0000000000000..3d9a957a5e3dc --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/traits.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/type_fwd.h" + +#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/string_type.h" + +namespace arrow::matlab::type::proxy { + + template <typename ArrowType> + struct Traits; + + template <> + struct Traits<arrow::UInt8Type> { + using TypeProxy = PrimitiveCType<uint8_t>; + }; + + template <> + struct Traits<arrow::UInt16Type> { + using TypeProxy = PrimitiveCType<uint16_t>; + }; + + template <> + struct Traits<arrow::UInt32Type> { + using TypeProxy = PrimitiveCType<uint32_t>; + }; + + template <> + struct Traits<arrow::UInt64Type> { + using TypeProxy = PrimitiveCType<uint64_t>; + }; + + template <> + struct Traits<arrow::Int8Type> { + using TypeProxy = PrimitiveCType<int8_t>; + }; + + template <> + struct Traits<arrow::Int16Type> { + using TypeProxy = PrimitiveCType<int16_t>; + }; + + template <> + struct Traits<arrow::Int32Type> { + using TypeProxy = PrimitiveCType<int32_t>; + }; + + template <> + struct Traits<arrow::Int64Type> { + using TypeProxy = PrimitiveCType<int64_t>; + }; + + template <> + struct Traits<arrow::FloatType> { + using TypeProxy = PrimitiveCType<float>; + }; + + template <> + struct Traits<arrow::DoubleType> { + using TypeProxy = PrimitiveCType<double>; + }; + + template <> + struct Traits<arrow::StringType> { + using TypeProxy = StringType; + }; + + template <> + struct Traits<arrow::TimestampType> { + using TypeProxy = TimestampType; + }; +} diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/type.cc b/matlab/src/cpp/arrow/matlab/type/proxy/type.cc new file mode 100644 index 0000000000000..f6a307ff3f62f --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/type.cc @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/matlab/type/proxy/type.h" + +namespace arrow::matlab::type::proxy { + + Type::Type(std::shared_ptr<arrow::DataType> type) : data_type{std::move(type)} { + REGISTER_METHOD(Type, typeID); + REGISTER_METHOD(Type, numFields); + } + + std::shared_ptr<arrow::DataType> Type::unwrap() { + return data_type; + } + + void Type::typeID(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + auto type_number_mda = factory.createScalar(static_cast<int32_t>(data_type->id())); + context.outputs[0] = type_number_mda; + } + + void Type::numFields(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + auto num_fields_mda = factory.createScalar(data_type->num_fields()); + context.outputs[0] = num_fields_mda; + } + +} + diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/type.h b/matlab/src/cpp/arrow/matlab/type/proxy/type.h new file mode 100644 index 0000000000000..e94097aa73cb4 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/type.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/type.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::type::proxy { + +class Type : public libmexclass::proxy::Proxy { + public: + Type(std::shared_ptr<arrow::DataType> type); + + virtual ~Type() {} + + std::shared_ptr<arrow::DataType> unwrap(); + + protected: + + void typeID(libmexclass::proxy::method::Context& context); + + void numFields(libmexclass::proxy::method::Context& context); + + std::shared_ptr<arrow::DataType> data_type; +}; + +} diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc new file mode 100644 index 0000000000000..b01148fe1c0a9 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/matlab/type/proxy/wrap.h" + +#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/string_type.h" + +namespace arrow::matlab::type::proxy { + + arrow::Result<std::shared_ptr<Type>> wrap(const std::shared_ptr<arrow::DataType>& type) { + using ID = arrow::Type::type; + switch (type->id()) { + case ID::BOOL: + return std::make_shared<PrimitiveCType<bool>>(std::static_pointer_cast<arrow::BooleanType>(type)); + case ID::UINT8: + return std::make_shared<PrimitiveCType<uint8_t>>(std::static_pointer_cast<arrow::UInt8Type>(type)); + case ID::UINT16: + return std::make_shared<PrimitiveCType<uint16_t>>(std::static_pointer_cast<arrow::UInt16Type>(type)); + case ID::UINT32: + return std::make_shared<PrimitiveCType<uint32_t>>(std::static_pointer_cast<arrow::UInt32Type>(type)); + case ID::UINT64: + return std::make_shared<PrimitiveCType<uint64_t>>(std::static_pointer_cast<arrow::UInt64Type>(type)); + case ID::INT8: + return std::make_shared<PrimitiveCType<int8_t>>(std::static_pointer_cast<arrow::Int8Type>(type)); + case ID::INT16: + return std::make_shared<PrimitiveCType<int16_t>>(std::static_pointer_cast<arrow::Int16Type>(type)); + case ID::INT32: + return std::make_shared<PrimitiveCType<int32_t>>(std::static_pointer_cast<arrow::Int32Type>(type)); + case ID::INT64: + return std::make_shared<PrimitiveCType<int64_t>>(std::static_pointer_cast<arrow::Int64Type>(type)); + case ID::FLOAT: + return std::make_shared<PrimitiveCType<float>>(std::static_pointer_cast<arrow::FloatType>(type)); + case ID::DOUBLE: + return std::make_shared<PrimitiveCType<double>>(std::static_pointer_cast<arrow::DoubleType>(type)); + case ID::TIMESTAMP: + return std::make_shared<TimestampType>(std::static_pointer_cast<arrow::TimestampType>(type)); + case ID::STRING: + return std::make_shared<StringType>(std::static_pointer_cast<arrow::StringType>(type)); + default: + return arrow::Status::NotImplemented("Unsupported DataType: " + type->ToString()); + } + } +} diff --git a/cpp/src/arrow/util/bytes_view.h b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h similarity index 78% rename from cpp/src/arrow/util/bytes_view.h rename to matlab/src/cpp/arrow/matlab/type/proxy/wrap.h index b1aacc96ed8d8..f5e2d30f8f4ec 100644 --- a/cpp/src/arrow/util/bytes_view.h +++ b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License.
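// wrap() gives C++ callers a single entry point for converting any supported
// arrow::DataType into the matching MATLAB proxy. A hypothetical caller,
// assuming a surrounding function that itself returns arrow::Result (a
// sketch, not part of this change):
//
//   ARROW_ASSIGN_OR_RAISE(auto type_proxy,
//                         arrow::matlab::type::proxy::wrap(arrow::int32()));
//   // type_proxy is a std::shared_ptr<arrow::matlab::type::proxy::Type>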
-#pragma once +#include "arrow/type.h" +#include "arrow/result.h" -#include <cstdint> -#include <string_view> +#include "arrow/matlab/type/proxy/type.h" -namespace arrow { -namespace util { +namespace arrow::matlab::type::proxy { -using bytes_view = std::basic_string_view<uint8_t>; +arrow::Result<std::shared_ptr<Type>> wrap(const std::shared_ptr<arrow::DataType>& type); -} // namespace util -} // namespace arrow + +} diff --git a/matlab/src/cpp/arrow/matlab/type/time_unit.cc b/matlab/src/cpp/arrow/matlab/type/time_unit.cc index 15ebfcfc0c06b..eb839b0e78096 100644 --- a/matlab/src/cpp/arrow/matlab/type/time_unit.cc +++ b/matlab/src/cpp/arrow/matlab/type/time_unit.cc @@ -20,7 +20,7 @@ namespace arrow::matlab::type { - arrow::Result<arrow::TimeUnit::type> timeUnitFromString(const std::u16string& unit_str) { + arrow::Result<arrow::TimeUnit::type> timeUnitFromString(std::u16string_view unit_str) { if (unit_str == u"Second") { return arrow::TimeUnit::type::SECOND; } else if (unit_str == u"Millisecond") { diff --git a/matlab/src/cpp/arrow/matlab/type/time_unit.h b/matlab/src/cpp/arrow/matlab/type/time_unit.h index cf3248d77b967..9534b1f902db7 100644 --- a/matlab/src/cpp/arrow/matlab/type/time_unit.h +++ b/matlab/src/cpp/arrow/matlab/type/time_unit.h @@ -18,10 +18,10 @@ #include "arrow/type_fwd.h" #include "arrow/result.h" -#include <string> +#include <string_view> namespace arrow::matlab::type { - arrow::Result<arrow::TimeUnit::type> timeUnitFromString(const std::u16string& unit_str); + arrow::Result<arrow::TimeUnit::type> timeUnitFromString(std::u16string_view unit_str); } diff --git a/matlab/src/matlab/+arrow/+array/Array.m b/matlab/src/matlab/+arrow/+array/Array.m index 9b8796c33b974..7426052764166 100644 --- a/matlab/src/matlab/+arrow/+array/Array.m +++ b/matlab/src/matlab/+arrow/+array/Array.m @@ -26,7 +26,7 @@ Valid % Validity bitmap end - properties(Abstract, SetAccess=private, GetAccess=public) + properties(Dependent, SetAccess=private, GetAccess=public) Type(1, 1) arrow.type.Type end @@ -46,6 +46,13 @@ function matlabArray = toMATLAB(obj) matlabArray = obj.Proxy.toMATLAB(); end + + function type = get.Type(obj) + [proxyID, typeID] = obj.Proxy.type(); + traits = arrow.type.traits.traits(arrow.type.ID(typeID)); + proxy = libmexclass.proxy.Proxy(Name=traits.TypeProxyClassName, ID=proxyID); + type = traits.TypeConstructor(proxy); + end end methods (Access = private) diff --git a/matlab/src/matlab/+arrow/+array/BooleanArray.m b/matlab/src/matlab/+arrow/+array/BooleanArray.m index e5c4cc527e552..f4d341efce9d3 100644 --- a/matlab/src/matlab/+arrow/+array/BooleanArray.m +++ b/matlab/src/matlab/+arrow/+array/BooleanArray.m @@ -20,10 +20,6 @@ NullSubstitionValue = false; end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.BooleanType - end - methods function obj = BooleanArray(data, opts) arguments diff --git a/matlab/src/matlab/+arrow/+array/Float32Array.m b/matlab/src/matlab/+arrow/+array/Float32Array.m index 29f23393a4346..c6be563d8621f 100644 --- a/matlab/src/matlab/+arrow/+array/Float32Array.m +++ b/matlab/src/matlab/+arrow/+array/Float32Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = single(NaN); end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.Float32Type - end - methods function obj = Float32Array(data, varargin) obj@arrow.array.NumericArray(data, "single", ...
diff --git a/matlab/src/matlab/+arrow/+array/Float64Array.m b/matlab/src/matlab/+arrow/+array/Float64Array.m index ab92715864275..ff43ebc0536c0 100644 --- a/matlab/src/matlab/+arrow/+array/Float64Array.m +++ b/matlab/src/matlab/+arrow/+array/Float64Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = NaN; end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.Float64Type - end - methods function obj = Float64Array(data, varargin) obj@arrow.array.NumericArray(data, "double", ... diff --git a/matlab/src/matlab/+arrow/+array/Int16Array.m b/matlab/src/matlab/+arrow/+array/Int16Array.m index 23716d5f59ec5..533f0c9ef549d 100644 --- a/matlab/src/matlab/+arrow/+array/Int16Array.m +++ b/matlab/src/matlab/+arrow/+array/Int16Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = int16(0) end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.Int16Type - end - methods function obj = Int16Array(data, varargin) obj@arrow.array.NumericArray(data, "int16", ... diff --git a/matlab/src/matlab/+arrow/+array/Int32Array.m b/matlab/src/matlab/+arrow/+array/Int32Array.m index 8844576ae1ef9..0f977fb90f808 100644 --- a/matlab/src/matlab/+arrow/+array/Int32Array.m +++ b/matlab/src/matlab/+arrow/+array/Int32Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = int32(0) end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.Int32Type - end - methods function obj = Int32Array(data, varargin) obj@arrow.array.NumericArray(data, "int32", ... diff --git a/matlab/src/matlab/+arrow/+array/Int64Array.m b/matlab/src/matlab/+arrow/+array/Int64Array.m index 9f72c5f2a6854..94cad56519b11 100644 --- a/matlab/src/matlab/+arrow/+array/Int64Array.m +++ b/matlab/src/matlab/+arrow/+array/Int64Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = int64(0); end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.Int64Type - end - methods function obj = Int64Array(data, varargin) obj@arrow.array.NumericArray(data, "int64", ... diff --git a/matlab/src/matlab/+arrow/+array/Int8Array.m b/matlab/src/matlab/+arrow/+array/Int8Array.m index f9774f6527493..83a14caa27287 100644 --- a/matlab/src/matlab/+arrow/+array/Int8Array.m +++ b/matlab/src/matlab/+arrow/+array/Int8Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = int8(0); end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.Int8Type - end - methods function obj = Int8Array(data, varargin) obj@arrow.array.NumericArray(data, "int8", ... 
diff --git a/matlab/src/matlab/+arrow/+array/NumericArray.m b/matlab/src/matlab/+arrow/+array/NumericArray.m index 16b96be7acc5d..fb2fc1d333939 100644 --- a/matlab/src/matlab/+arrow/+array/NumericArray.m +++ b/matlab/src/matlab/+arrow/+array/NumericArray.m @@ -15,35 +15,26 @@ classdef NumericArray < arrow.array.Array % arrow.array.NumericArray - - - properties (Hidden, SetAccess=protected) - MatlabArray = [] - end properties(Abstract, Access=protected) NullSubstitutionValue; end methods - function obj = NumericArray(data, type, proxyName, opts, nullOpts) + function obj = NumericArray(data, type, proxyName, opts) arguments data type(1, 1) string proxyName(1, 1) string - opts.DeepCopy(1, 1) logical = false - nullOpts.InferNulls(1, 1) logical = true - nullOpts.Valid + opts.InferNulls(1, 1) logical = true + opts.Valid end arrow.args.validateTypeAndShape(data, type); - validElements = arrow.args.parseValidElements(data, nullOpts); - opts = struct(MatlabArray=data, Valid=validElements, DeepCopy=opts.DeepCopy); + validElements = arrow.args.parseValidElements(data, opts); + opts = struct(MatlabArray=data, Valid=validElements); obj@arrow.array.Array("Name", proxyName, "ConstructorArguments", {opts}); - obj.MatlabArray = cast(obj.MatlabArray, type); - % Store a reference to the array if not doing a deep copy - if ~opts.DeepCopy, obj.MatlabArray = data; end end - + function matlabArray = toMATLAB(obj) matlabArray = obj.Proxy.toMATLAB(); matlabArray(~obj.Valid) = obj.NullSubstitutionValue; diff --git a/matlab/src/matlab/+arrow/+array/StringArray.m b/matlab/src/matlab/+arrow/+array/StringArray.m index 9ef3f0252586f..ec2d53b371fe2 100644 --- a/matlab/src/matlab/+arrow/+array/StringArray.m +++ b/matlab/src/matlab/+arrow/+array/StringArray.m @@ -20,10 +20,6 @@ NullSubstitionValue = string(missing); end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.StringType - end - methods function obj = StringArray(data, opts) arguments diff --git a/matlab/src/matlab/+arrow/+array/TimestampArray.m b/matlab/src/matlab/+arrow/+array/TimestampArray.m index 0aa76beb99c7a..0f0da4e82130c 100644 --- a/matlab/src/matlab/+arrow/+array/TimestampArray.m +++ b/matlab/src/matlab/+arrow/+array/TimestampArray.m @@ -20,10 +20,6 @@ NullSubstitutionValue = NaT; end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.TimestampType % temporarily default value - end - methods function obj = TimestampArray(data, opts) arguments @@ -39,7 +35,6 @@ args = struct(MatlabArray=ptime, Valid=validElements, TimeZone=timezone, TimeUnit=string(opts.TimeUnit)); obj@arrow.array.Array("Name", "arrow.array.proxy.TimestampArray", "ConstructorArguments", {args}); - obj.Type = arrow.type.TimestampType(TimeUnit=opts.TimeUnit, TimeZone=timezone); end function dates = toMATLAB(obj) @@ -48,7 +43,7 @@ epoch = datetime(1970, 1, 1, TimeZone="UTC"); tz = obj.Type.TimeZone; - ticsPerSecond = obj.Type.TimeUnit.TicksPerSecond; + ticsPerSecond = ticksPerSecond(obj.Type.TimeUnit); dates = datetime(time, ConvertFrom="epochtime", Epoch=epoch, ... TimeZone=tz, TicksPerSecond=ticsPerSecond); @@ -72,7 +67,7 @@ % % TODO: convertTo may error if the datetime is 2^63-1 before or % after the epoch. We should throw a custom error in this case. 
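% As a concrete example of the tick arithmetic (illustrative): with
% TimeUnit.Millisecond, ticksPerSecond(units) is 1e3, so the datetime
% 1970-01-02 00:00:00 UTC converts to int64(86400 * 1e3) = 86400000.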
- time(indices) = convertTo(dates(indices), "epochtime", TicksPerSecond=units.TicksPerSecond); + time(indices) = convertTo(dates(indices), "epochtime", TicksPerSecond=ticksPerSecond(units)); end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+array/UInt16Array.m b/matlab/src/matlab/+arrow/+array/UInt16Array.m index 3732df3c76111..4862ca20b9f88 100644 --- a/matlab/src/matlab/+arrow/+array/UInt16Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt16Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = uint16(0) end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.UInt16Type - end - methods function obj = UInt16Array(data, varargin) obj@arrow.array.NumericArray(data, "uint16", ... diff --git a/matlab/src/matlab/+arrow/+array/UInt32Array.m b/matlab/src/matlab/+arrow/+array/UInt32Array.m index 183d4df08257a..782b0010997fc 100644 --- a/matlab/src/matlab/+arrow/+array/UInt32Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt32Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = uint32(0) end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.UInt32Type - end - methods function obj = UInt32Array(data, varargin) obj@arrow.array.NumericArray(data, "uint32", ... diff --git a/matlab/src/matlab/+arrow/+array/UInt64Array.m b/matlab/src/matlab/+arrow/+array/UInt64Array.m index af828978ce2a7..9e25ce4987bc1 100644 --- a/matlab/src/matlab/+arrow/+array/UInt64Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt64Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = uint64(0) end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.UInt64Type - end - methods function obj = UInt64Array(data, varargin) obj@arrow.array.NumericArray(data, "uint64", ... diff --git a/matlab/src/matlab/+arrow/+array/UInt8Array.m b/matlab/src/matlab/+arrow/+array/UInt8Array.m index b5dc664ea1476..8bad2401bd429 100644 --- a/matlab/src/matlab/+arrow/+array/UInt8Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt8Array.m @@ -20,10 +20,6 @@ NullSubstitutionValue = uint8(0) end - properties(SetAccess=private, GetAccess=public) - Type = arrow.type.UInt8Type - end - methods function obj = UInt8Array(data, varargin) obj@arrow.array.NumericArray(data, "uint8", ... diff --git a/matlab/src/matlab/+arrow/+internal/+proxy/create.m b/matlab/src/matlab/+arrow/+internal/+proxy/create.m new file mode 100644 index 0000000000000..0ed1476058df6 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+proxy/create.m @@ -0,0 +1,25 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function proxy = create(name, args) +%CREATE Creates a proxy object. 
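% Example (illustrative; assumes "arrow.type.proxy.StringType" is one of the
% proxy names registered with the MEX gateway):
%
%   proxy = arrow.internal.proxy.create("arrow.type.proxy.StringType");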
+ arguments + name(1, 1) string {mustBeNonmissing} + end + arguments(Repeating) + args + end + proxy = libmexclass.proxy.Proxy.create(name, args{:}); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+internal/+proxy/validate.m b/matlab/src/matlab/+arrow/+internal/+proxy/validate.m new file mode 100644 index 0000000000000..1b2b3649e42c3 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+proxy/validate.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function validate(proxy, expectedName) +%VALIDATE Throws an arrow:proxy:ProxyNameMismatch error if +% proxy.Name and expectedName are not equal. + arguments + proxy(1, 1) libmexclass.proxy.Proxy + expectedName(1, 1) string + end + + if proxy.Name ~= expectedName + errid = "arrow:proxy:ProxyNameMismatch"; + msg = "Proxy class name is " + proxy.Name + ", but expected " + expectedName; + error(errid, msg); + end +end diff --git a/matlab/src/matlab/+arrow/+type/+traits/BooleanTraits.m b/matlab/src/matlab/+arrow/+type/+traits/BooleanTraits.m new file mode 100644 index 0000000000000..82a8b6b1e28ba --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/BooleanTraits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License.
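% Each concrete *Traits class below pairs one Arrow type with its MATLAB
% array/type constructors, proxy class names, and MATLAB conversion function.
% A sketch of how the constants are consumed (illustrative):
%
%   traits = arrow.type.traits.BooleanTraits();
%   data = traits.MatlabConstructor([1 0 1]);  % logical([1 0 1])
%   arr = traits.ArrayConstructor(data);       % an arrow.array.BooleanArray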
+ +classdef BooleanTraits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.BooleanArray + ArrayClassName = "arrow.array.BooleanArray" + ArrayProxyClassName = "arrow.array.proxy.BooleanArray" + TypeConstructor = @arrow.type.BooleanType; + TypeClassName = "arrow.type.BooleanType" + TypeProxyClassName = "arrow.type.proxy.BooleanType" + MatlabConstructor = @logical + MatlabClassName = "logical" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/Float32Traits.m b/matlab/src/matlab/+arrow/+type/+traits/Float32Traits.m new file mode 100644 index 0000000000000..7dc0d17474e2f --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/Float32Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Float32Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.Float32Array + ArrayClassName = "arrow.array.Float32Array" + ArrayProxyClassName = "arrow.array.proxy.Float32Array" + TypeConstructor = @arrow.type.Float32Type; + TypeClassName = "arrow.type.Float32Type" + TypeProxyClassName = "arrow.type.proxy.Float32Type" + MatlabConstructor = @single + MatlabClassName = "single" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/Float64Traits.m b/matlab/src/matlab/+arrow/+type/+traits/Float64Traits.m new file mode 100644 index 0000000000000..9c52634b2c942 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/Float64Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef Float64Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.Float64Array + ArrayClassName = "arrow.array.Float64Array" + ArrayProxyClassName = "arrow.array.proxy.Float64Array" + TypeConstructor = @arrow.type.Float64Type; + TypeClassName = "arrow.type.Float64Type" + TypeProxyClassName = "arrow.type.proxy.Float64Type" + MatlabConstructor = @double + MatlabClassName = "double" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/Int16Traits.m b/matlab/src/matlab/+arrow/+type/+traits/Int16Traits.m new file mode 100644 index 0000000000000..46b67b43c1783 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/Int16Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Int16Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.Int16Array + ArrayClassName = "arrow.array.Int16Array" + ArrayProxyClassName = "arrow.array.proxy.Int16Array" + TypeConstructor = @arrow.type.Int16Type; + TypeClassName = "arrow.type.Int16Type" + TypeProxyClassName = "arrow.type.proxy.Int16Type" + MatlabConstructor = @int16 + MatlabClassName = "int16" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/Int32Traits.m b/matlab/src/matlab/+arrow/+type/+traits/Int32Traits.m new file mode 100644 index 0000000000000..4117271e50ff1 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/Int32Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef Int32Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.Int32Array + ArrayClassName = "arrow.array.Int32Array" + ArrayProxyClassName = "arrow.array.proxy.Int32Array" + TypeConstructor = @arrow.type.Int32Type; + TypeClassName = "arrow.type.Int32Type" + TypeProxyClassName = "arrow.type.proxy.Int32Type" + MatlabConstructor = @int32 + MatlabClassName = "int32" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/Int64Traits.m b/matlab/src/matlab/+arrow/+type/+traits/Int64Traits.m new file mode 100644 index 0000000000000..e25da953aa0fc --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/Int64Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Int64Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.Int64Array + ArrayClassName = "arrow.array.Int64Array" + ArrayProxyClassName = "arrow.array.proxy.Int64Array" + TypeConstructor = @arrow.type.Int64Type; + TypeClassName = "arrow.type.Int64Type" + TypeProxyClassName = "arrow.type.proxy.Int64Type" + MatlabConstructor = @int64 + MatlabClassName = "int64" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/Int8Traits.m b/matlab/src/matlab/+arrow/+type/+traits/Int8Traits.m new file mode 100644 index 0000000000000..9f73bd2667e1b --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/Int8Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef Int8Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.Int8Array + ArrayClassName = "arrow.array.Int8Array" + ArrayProxyClassName = "arrow.array.proxy.Int8Array" + TypeConstructor = @arrow.type.Int8Type; + TypeClassName = "arrow.type.Int8Type" + TypeProxyClassName = "arrow.type.proxy.Int8Type" + MatlabConstructor = @int8 + MatlabClassName = "int8" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/StringTraits.m b/matlab/src/matlab/+arrow/+type/+traits/StringTraits.m new file mode 100644 index 0000000000000..0730657270129 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/StringTraits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef StringTraits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.StringArray + ArrayClassName = "arrow.array.StringArray" + ArrayProxyClassName = "arrow.array.proxy.StringArray" + TypeConstructor = @arrow.type.StringType; + TypeClassName = "arrow.type.StringType" + TypeProxyClassName = "arrow.type.proxy.StringType" + MatlabConstructor = @string + MatlabClassName = "string" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/TimestampTraits.m b/matlab/src/matlab/+arrow/+type/+traits/TimestampTraits.m new file mode 100644 index 0000000000000..488a5e7314016 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/TimestampTraits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef TimestampTraits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.TimestampArray + ArrayClassName = "arrow.array.TimestampArray" + ArrayProxyClassName = "arrow.array.proxy.TimestampArray" + TypeConstructor = @arrow.type.TimestampType; + TypeClassName = "arrow.type.TimestampType" + TypeProxyClassName = "arrow.type.proxy.TimestampType" + MatlabConstructor = @datetime + MatlabClassName = "datetime" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/TypeTraits.m b/matlab/src/matlab/+arrow/+type/+traits/TypeTraits.m new file mode 100644 index 0000000000000..54b8fc0a7709c --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/TypeTraits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef TypeTraits + + properties (Abstract, Constant) + ArrayConstructor + ArrayClassName + ArrayProxyClassName + TypeConstructor + TypeClassName + TypeProxyClassName + MatlabConstructor + MatlabClassName + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/UInt16Traits.m b/matlab/src/matlab/+arrow/+type/+traits/UInt16Traits.m new file mode 100644 index 0000000000000..b90e6294ce0d8 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/UInt16Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef UInt16Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.UInt16Array + ArrayClassName = "arrow.array.UInt16Array" + ArrayProxyClassName = "arrow.array.proxy.UInt16Array" + TypeConstructor = @arrow.type.UInt16Type; + TypeClassName = "arrow.type.UInt16Type" + TypeProxyClassName = "arrow.type.proxy.UInt16Type" + MatlabConstructor = @uint16 + MatlabClassName = "uint16" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/UInt32Traits.m b/matlab/src/matlab/+arrow/+type/+traits/UInt32Traits.m new file mode 100644 index 0000000000000..ff79bd9579a3b --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/UInt32Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef UInt32Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.UInt32Array + ArrayClassName = "arrow.array.UInt32Array" + ArrayProxyClassName = "arrow.array.proxy.UInt32Array" + TypeConstructor = @arrow.type.UInt32Type; + TypeClassName = "arrow.type.UInt32Type" + TypeProxyClassName = "arrow.type.proxy.UInt32Type" + MatlabConstructor = @uint32 + MatlabClassName = "uint32" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/UInt64Traits.m b/matlab/src/matlab/+arrow/+type/+traits/UInt64Traits.m new file mode 100644 index 0000000000000..a6b0de37528a9 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/UInt64Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef UInt64Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.UInt64Array + ArrayClassName = "arrow.array.UInt64Array" + ArrayProxyClassName = "arrow.array.proxy.UInt64Array" + TypeConstructor = @arrow.type.UInt64Type; + TypeClassName = "arrow.type.UInt64Type" + TypeProxyClassName = "arrow.type.proxy.UInt64Type" + MatlabConstructor = @uint64 + MatlabClassName = "uint64" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/UInt8Traits.m b/matlab/src/matlab/+arrow/+type/+traits/UInt8Traits.m new file mode 100644 index 0000000000000..ff2377ff812c3 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/UInt8Traits.m @@ -0,0 +1,29 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef UInt8Traits < arrow.type.traits.TypeTraits + + properties (Constant) + ArrayConstructor = @arrow.array.UInt8Array + ArrayClassName = "arrow.array.UInt8Array" + ArrayProxyClassName = "arrow.array.proxy.UInt8Array" + TypeConstructor = @arrow.type.UInt8Type; + TypeClassName = "arrow.type.UInt8Type" + TypeProxyClassName = "arrow.type.proxy.UInt8Type" + MatlabConstructor = @uint8 + MatlabClassName = "uint8" + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/+traits/traits.m b/matlab/src/matlab/+arrow/+type/+traits/traits.m new file mode 100644 index 0000000000000..af59e2822df96 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/+traits/traits.m @@ -0,0 +1,89 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function typeTraits = traits(type) + % "Gateway" function that links an arrow Type ID enumeration (e.g. + % arrow.type.ID.String) or a MATLAB class string (e.g. "datetime") + % to associated type information. 
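% Example (illustrative):
%
%   tt = arrow.type.traits.traits(arrow.type.ID.Float64);
%   tt.ArrayClassName   % "arrow.array.Float64Array"
%
%   tt = arrow.type.traits.traits("datetime");
%   tt.TypeClassName    % "arrow.type.TimestampType"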
+ import arrow.type.traits.* + import arrow.type.* + + if isa(type, "arrow.type.ID") + switch type + case ID.UInt8 + typeTraits = UInt8Traits(); + case ID.UInt16 + typeTraits = UInt16Traits(); + case ID.UInt32 + typeTraits = UInt32Traits(); + case ID.UInt64 + typeTraits = UInt64Traits(); + case ID.Int8 + typeTraits = Int8Traits(); + case ID.Int16 + typeTraits = Int16Traits(); + case ID.Int32 + typeTraits = Int32Traits(); + case ID.Int64 + typeTraits = Int64Traits(); + case ID.Float32 + typeTraits = Float32Traits(); + case ID.Float64 + typeTraits = Float64Traits(); + case ID.Boolean + typeTraits = BooleanTraits(); + case ID.String + typeTraits = StringTraits(); + case ID.Timestamp + typeTraits = TimestampTraits(); + otherwise + error("arrow:type:traits:UnsupportedArrowTypeID", "Unsupported Arrow type ID: " + type); + end + elseif isa(type, "string") % MATLAB class string + switch type + case "uint8" + typeTraits = UInt8Traits(); + case "uint16" + typeTraits = UInt16Traits(); + case "uint32" + typeTraits = UInt32Traits(); + case "uint64" + typeTraits = UInt64Traits(); + case "int8" + typeTraits = Int8Traits(); + case "int16" + typeTraits = Int16Traits(); + case "int32" + typeTraits = Int32Traits(); + case "int64" + typeTraits = Int64Traits(); + case "single" + typeTraits = Float32Traits(); + case "double" + typeTraits = Float64Traits(); + case "logical" + typeTraits = BooleanTraits(); + case "string" + typeTraits = StringTraits(); + case "datetime" + typeTraits = TimestampTraits(); + otherwise + error("arrow:type:traits:UnsupportedMatlabClass", "Unsupported MATLAB class: " + type); + end + else + error("arrow:type:traits:UnsupportedInputType", "The input argument to the traits function " + ... + "must be a MATLAB class string or an arrow.type.ID enumeration."); + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/BooleanType.m b/matlab/src/matlab/+arrow/+type/BooleanType.m index 050beae3f5120..6afa00e9258cb 100644 --- a/matlab/src/matlab/+arrow/+type/BooleanType.m +++ b/matlab/src/matlab/+arrow/+type/BooleanType.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef BooleanType < arrow.type.PrimitiveType +classdef BooleanType < arrow.type.FixedWidthType %BOOLEANTYPE Type class for boolean data. - properties(SetAccess = protected) - ID = arrow.type.ID.Boolean + methods + function obj = BooleanType(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.BooleanType")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/Field.m b/matlab/src/matlab/+arrow/+type/Field.m new file mode 100644 index 0000000000000..aaab36b048e37 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/Field.m @@ -0,0 +1,67 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Field < matlab.mixin.CustomDisplay +%FIELD A class representing a name and a type. +% Fields are often used in tabular schemas for describing a column's +% name and type. + + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + properties (Dependent) + % Name of the field + Name + % Arrow type of the field + Type + end + + methods + function obj = Field(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Field")} + end + import arrow.internal.proxy.validate + + obj.Proxy = proxy; + end + + function type = get.Type(obj) + [proxyID, typeID] = obj.Proxy.type(); + traits = arrow.type.traits.traits(arrow.type.ID(typeID)); + proxy = libmexclass.proxy.Proxy(Name=traits.TypeProxyClassName, ID=proxyID); + type = traits.TypeConstructor(proxy); + end + + function name = get.Name(obj) + name = obj.Proxy.name(); + end + + end + + methods (Access = private) + function str = toString(obj) + str = obj.Proxy.toString(); + end + end + + methods (Access=protected) + function displayScalarObject(obj) + disp(obj.toString()); + end + end + +end diff --git a/matlab/src/matlab/+arrow/+type/PrimitiveType.m b/matlab/src/matlab/+arrow/+type/FixedWidthType.m similarity index 67% rename from matlab/src/matlab/+arrow/+type/PrimitiveType.m rename to matlab/src/matlab/+arrow/+type/FixedWidthType.m index 6297b98d8b01b..8c9c5b26081ae 100644 --- a/matlab/src/matlab/+arrow/+type/PrimitiveType.m +++ b/matlab/src/matlab/+arrow/+type/FixedWidthType.m @@ -13,21 +13,23 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef PrimitiveType < arrow.type.Type -%PRIMITIVETYPE Abstract type class representing primtive data types. +classdef (Abstract) FixedWidthType < arrow.type.Type +%FIXEDWIDTHTYPE Abstract type class representing fixed width data types. - properties(Dependent, SetAccess=protected, GetAccess=public) + properties(Dependent, SetAccess=private, GetAccess=public) BitWidth end - properties(Constant) - NumFields = 0 - NumBuffers = 2 - end - methods + function obj = FixedWidthType(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy + end + obj@arrow.type.Type(proxy); + end + function width = get.BitWidth(obj) - width = bitWidth(obj.ID); + width = obj.Proxy.bitWidth(); end - end + end end diff --git a/matlab/src/matlab/+arrow/+type/Float32Type.m b/matlab/src/matlab/+arrow/+type/Float32Type.m index b0430bda7eab4..df5fa1ce844e9 100644 --- a/matlab/src/matlab/+arrow/+type/Float32Type.m +++ b/matlab/src/matlab/+arrow/+type/Float32Type.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Float32Type < arrow.type.PrimitiveType +classdef Float32Type < arrow.type.FixedWidthType %FLOAT32TYPE Type class for float32 data. 
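% Type objects now wrap a C++ proxy instead of storing their properties in
% MATLAB. A sketch of the internal construction path (illustrative; assumes
% the proxy name is registered with the MEX gateway, and users normally
% obtain a type from an array's Type property rather than building one):
%
%   proxy = arrow.internal.proxy.create("arrow.type.proxy.Float32Type");
%   type = arrow.type.Float32Type(proxy);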
- properties(SetAccess = protected) - ID = arrow.type.ID.Float32 + methods + function obj = Float32Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Float32Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/Float64Type.m b/matlab/src/matlab/+arrow/+type/Float64Type.m index a2ffe02b786af..ba93265ebc73e 100644 --- a/matlab/src/matlab/+arrow/+type/Float64Type.m +++ b/matlab/src/matlab/+arrow/+type/Float64Type.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Float64Type < arrow.type.PrimitiveType +classdef Float64Type < arrow.type.FixedWidthType %FLOAT64TYPE Type class for float64 data. - properties(SetAccess = protected) - ID = arrow.type.ID.Float64 + methods + function obj = Float64Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Float64Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/ID.m b/matlab/src/matlab/+arrow/+type/ID.m index 2e320603d039c..076d79d196a30 100644 --- a/matlab/src/matlab/+arrow/+type/ID.m +++ b/matlab/src/matlab/+arrow/+type/ID.m @@ -35,24 +35,4 @@ % Date64 (17) Timestamp (18) end - - methods - function bitWidth = bitWidth(obj) - import arrow.type.ID - switch obj - case ID.Boolean - bitWidth = 1; - case {ID.UInt8, ID.Int8} - bitWidth = 8; - case {ID.UInt16, ID.Int16} - bitWidth = 16; - case {ID.UInt32, ID.Int32, ID.Float32} - bitWidth = 32; - case {ID.UInt64, ID.Int64, ID.Float64, ID.Timestamp} - bitWidth = 64; - otherwise - bitWidth = NaN; - end - end - end end diff --git a/matlab/src/matlab/+arrow/+type/Int16Type.m b/matlab/src/matlab/+arrow/+type/Int16Type.m index 3d060f7e58671..c16d3fd5ca53f 100644 --- a/matlab/src/matlab/+arrow/+type/Int16Type.m +++ b/matlab/src/matlab/+arrow/+type/Int16Type.m @@ -13,11 +13,17 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Int16Type < arrow.type.PrimitiveType +classdef Int16Type < arrow.type.FixedWidthType %INT16TYPE Type class for int16 data. - properties(SetAccess = protected) - ID = arrow.type.ID.Int16 + methods + function obj = Int16Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Int16Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end diff --git a/matlab/src/matlab/+arrow/+type/Int32Type.m b/matlab/src/matlab/+arrow/+type/Int32Type.m index 98c81c08647dd..786697bf1136b 100644 --- a/matlab/src/matlab/+arrow/+type/Int32Type.m +++ b/matlab/src/matlab/+arrow/+type/Int32Type.m @@ -13,11 +13,17 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Int32Type < arrow.type.PrimitiveType +classdef Int32Type < arrow.type.FixedWidthType %INT32TYPE Type class for int32 data.
- properties(SetAccess = protected) - ID = arrow.type.ID.Int32 + methods + function obj = Int32Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Int32Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end diff --git a/matlab/src/matlab/+arrow/+type/Int64Type.m b/matlab/src/matlab/+arrow/+type/Int64Type.m index 23147817e36e1..bf6c71d622a63 100644 --- a/matlab/src/matlab/+arrow/+type/Int64Type.m +++ b/matlab/src/matlab/+arrow/+type/Int64Type.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Int64Type < arrow.type.PrimitiveType +classdef Int64Type < arrow.type.FixedWidthType %INT64TYPE Type class for int64 data. - properties(SetAccess = protected) - ID = arrow.type.ID.Int64 + methods + function obj = Int64Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Int64Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/Int8Type.m b/matlab/src/matlab/+arrow/+type/Int8Type.m index 9d364bb32be82..b28785f876ea8 100644 --- a/matlab/src/matlab/+arrow/+type/Int8Type.m +++ b/matlab/src/matlab/+arrow/+type/Int8Type.m @@ -13,11 +13,17 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Int8Type < arrow.type.PrimitiveType +classdef Int8Type < arrow.type.FixedWidthType %INT8TYPE Type class for int8 data. - properties(SetAccess = protected) - ID = arrow.type.ID.Int8 + methods + function obj = Int8Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Int8Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end diff --git a/matlab/src/matlab/+arrow/+type/StringType.m b/matlab/src/matlab/+arrow/+type/StringType.m index 66a15dd0ea3e2..c269bfa6db33c 100644 --- a/matlab/src/matlab/+arrow/+type/StringType.m +++ b/matlab/src/matlab/+arrow/+type/StringType.m @@ -16,14 +16,14 @@ classdef StringType < arrow.type.Type %STRINGTYPE Type class for string data. - properties(SetAccess = protected) - ID = arrow.type.ID.String - end - - properties(Constant) - NumFields = 0 - NumBuffers = 3 - end - + methods + function obj = StringType(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.StringType")} + end + import arrow.internal.proxy.validate + obj@arrow.type.Type(proxy); + end + end end diff --git a/matlab/src/matlab/+arrow/+type/TimeUnit.m b/matlab/src/matlab/+arrow/+type/TimeUnit.m index 3ec8bf44d104f..358818be985c9 100644 --- a/matlab/src/matlab/+arrow/+type/TimeUnit.m +++ b/matlab/src/matlab/+arrow/+type/TimeUnit.m @@ -12,33 +12,28 @@ % WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef TimeUnit +classdef TimeUnit < int16 % Enumeration class representing Time Units. 
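% Deriving from int16 makes each member carry the integer code used by the
% C++ proxies for arrow::TimeUnit::type, so values round-trip across the MEX
% boundary without a lookup table, e.g. (illustrative):
%
%   int16(arrow.type.TimeUnit.Millisecond)  % int16(1)
%   arrow.type.TimeUnit(3)                  % Nanosecond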
enumeration - Second - Millisecond - Microsecond - Nanosecond + Second (0) + Millisecond (1) + Microsecond (2) + Nanosecond (3) end - properties (Dependent) - TicksPerSecond - end - - - methods - function ticksPerSecond = get.TicksPerSecond(obj) + methods (Hidden) + function ticks = ticksPerSecond(obj) import arrow.type.TimeUnit switch obj case TimeUnit.Second - ticksPerSecond = 1; + ticks = 1; case TimeUnit.Millisecond - ticksPerSecond = 1e3; + ticks = 1e3; case TimeUnit.Microsecond - ticksPerSecond = 1e6; + ticks = 1e6; case TimeUnit.Nanosecond - ticksPerSecond = 1e9; + ticks = 1e9; end end end diff --git a/matlab/src/matlab/+arrow/+type/TimestampType.m b/matlab/src/matlab/+arrow/+type/TimestampType.m index 99ac4a7b769f7..b3d34f31b7d1e 100644 --- a/matlab/src/matlab/+arrow/+type/TimestampType.m +++ b/matlab/src/matlab/+arrow/+type/TimestampType.m @@ -13,29 +13,37 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef TimestampType < arrow.type.PrimitiveType +classdef TimestampType < arrow.type.FixedWidthType %TIMESTAMPTYPE Type class for timestamp data. - - properties(SetAccess=private) - TimeZone(1, 1) string - TimeUnit(1, 1) arrow.type.TimeUnit - end - - properties(SetAccess = protected) - ID = arrow.type.ID.Timestamp + properties(Dependent, SetAccess=private, GetAccess=public) + TimeZone + TimeUnit end methods - function obj = TimestampType(opts) - %TIMESTAMPTYPE Construct an instance of this class + function obj = TimestampType(proxy) arguments - opts.TimeUnit(1, 1) arrow.type.TimeUnit = arrow.type.TimeUnit.Microsecond - opts.TimeZone(1, 1) string {mustBeNonmissing} = "" + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.TimestampType")} end - obj.TimeUnit = opts.TimeUnit; - obj.TimeZone = opts.TimeZone; + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end + + function unit = get.TimeUnit(obj) + val = obj.Proxy.timeUnit(); + unit = arrow.type.TimeUnit(val); + end + + function tz = get.TimeZone(obj) + tz = obj.Proxy.timeZone(); end end -end + methods (Access=protected) + function group = getPropertyGroups(~) + targets = ["ID" "TimeUnit" "TimeZone"]; + group = matlab.mixin.util.PropertyGroup(targets); + end + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/Type.m b/matlab/src/matlab/+arrow/+type/Type.m index a05eb2253bf87..c2ae3dbc58c9c 100644 --- a/matlab/src/matlab/+arrow/+type/Type.m +++ b/matlab/src/matlab/+arrow/+type/Type.m @@ -13,11 +13,40 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef Type +classdef (Abstract) Type < matlab.mixin.CustomDisplay %TYPE Abstract type class. 
- properties (Abstract, SetAccess=protected) - ID(1, 1) arrow.type.ID + properties (Dependent, GetAccess=public, SetAccess=private) + ID + NumFields end -end + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + methods + function obj = Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy + end + obj.Proxy = proxy; + end + + function numFields = get.NumFields(obj) + numFields = obj.Proxy.numFields(); + end + + function typeID = get.ID(obj) + typeID = arrow.type.ID(obj.Proxy.typeID()); + end + end + + methods (Access=protected) + function propgrp = getPropertyGroups(~) + proplist = {'ID'}; + propgrp = matlab.mixin.util.PropertyGroup(proplist); + end + end + +end diff --git a/matlab/src/matlab/+arrow/+type/UInt16Type.m b/matlab/src/matlab/+arrow/+type/UInt16Type.m index 8d53ea68556d8..3198b78671ef9 100644 --- a/matlab/src/matlab/+arrow/+type/UInt16Type.m +++ b/matlab/src/matlab/+arrow/+type/UInt16Type.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef UInt16Type < arrow.type.PrimitiveType +classdef UInt16Type < arrow.type.FixedWidthType %UINT16TYPE Type class for uint16 data. - properties(SetAccess = protected) - ID = arrow.type.ID.UInt16 + methods + function obj = UInt16Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.UInt16Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/UInt32Type.m b/matlab/src/matlab/+arrow/+type/UInt32Type.m index 693bd897d66dc..53e60e4e34290 100644 --- a/matlab/src/matlab/+arrow/+type/UInt32Type.m +++ b/matlab/src/matlab/+arrow/+type/UInt32Type.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef UInt32Type < arrow.type.PrimitiveType +classdef UInt32Type < arrow.type.FixedWidthType %UINT32TYPE Type class for uint32 data. - properties(SetAccess = protected) - ID = arrow.type.ID.UInt32 + methods + function obj = UInt32Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.UInt32Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/UInt64Type.m b/matlab/src/matlab/+arrow/+type/UInt64Type.m index fbd06646cedd7..f8512ec59497c 100644 --- a/matlab/src/matlab/+arrow/+type/UInt64Type.m +++ b/matlab/src/matlab/+arrow/+type/UInt64Type.m @@ -13,10 +13,16 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef UInt64Type < arrow.type.PrimitiveType +classdef UInt64Type < arrow.type.FixedWidthType %UINT64TYPE Type class for uint64 data. - properties(SetAccess = protected) - ID = arrow.type.ID.UInt64 + methods + function obj = UInt64Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.UInt64Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+type/UInt8Type.m b/matlab/src/matlab/+arrow/+type/UInt8Type.m index 9abd001b43c67..898426e3a4076 100644 --- a/matlab/src/matlab/+arrow/+type/UInt8Type.m +++ b/matlab/src/matlab/+arrow/+type/UInt8Type.m @@ -13,10 +13,16 @@ % implied. 
See the License for the specific language governing % permissions and limitations under the License. -classdef UInt8Type < arrow.type.PrimitiveType +classdef UInt8Type < arrow.type.FixedWidthType %UINT8TYPE Type class for uint8 data. - properties(SetAccess = protected) - ID = arrow.type.ID.UInt8 + methods + function obj = UInt8Type(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.UInt8Type")} + end + import arrow.internal.proxy.validate + obj@arrow.type.FixedWidthType(proxy); + end end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/boolean.m b/matlab/src/matlab/+arrow/boolean.m new file mode 100644 index 0000000000000..f5331d790e595 --- /dev/null +++ b/matlab/src/matlab/+arrow/boolean.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +function type = boolean() +%BOOLEAN Creates an arrow.type.BooleanType object + proxy = arrow.internal.proxy.create("arrow.type.proxy.BooleanType"); + type = arrow.type.BooleanType(proxy); +end + diff --git a/matlab/src/matlab/+arrow/field.m b/matlab/src/matlab/+arrow/field.m new file mode 100644 index 0000000000000..a14ed2268bd35 --- /dev/null +++ b/matlab/src/matlab/+arrow/field.m @@ -0,0 +1,27 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function f = field(name, type) +%FIELD Creates an arrow.type.Field object + arguments + name(1, 1) string {mustBeNonmissing} + type(1, 1) arrow.type.Type + end + + typeProxyID = type.Proxy.ID; + args = struct(Name=name, TypeProxyID=typeProxyID); + proxy = arrow.internal.proxy.create("arrow.type.proxy.Field", args); + f = arrow.type.Field(proxy); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/float32.m b/matlab/src/matlab/+arrow/float32.m new file mode 100644 index 0000000000000..d8c44dfc7f03e --- /dev/null +++ b/matlab/src/matlab/+arrow/float32.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. 
See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = float32() +%FLOAT32 Creates an arrow.type.Float32Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.Float32Type"); + type = arrow.type.Float32Type(proxy); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/float64.m b/matlab/src/matlab/+arrow/float64.m new file mode 100644 index 0000000000000..ae2fdc44c2a84 --- /dev/null +++ b/matlab/src/matlab/+arrow/float64.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = float64() +%FLOAT64 Creates an arrow.type.Float64Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.Float64Type"); + type = arrow.type.Float64Type(proxy); +end diff --git a/matlab/src/matlab/+arrow/int16.m b/matlab/src/matlab/+arrow/int16.m new file mode 100644 index 0000000000000..49f3bfdaa3522 --- /dev/null +++ b/matlab/src/matlab/+arrow/int16.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License.
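For context, each of the new arrow.* convenience functions in this diff follows the same two-step pattern: create the C++-backed proxy by its proxy class name, then wrap it in the matching arrow.type class. A minimal usage sketch, assuming the arrow MATLAB package is on the path (the field name "price" is illustrative, not from this diff):

    t = arrow.float64();            % proxy-backed arrow.type.Float64Type
    t.ID                            % arrow.type.ID.Float64, resolved through the proxy
    t.NumFields                     % int32(0) for fixed-width primitive types
    f = arrow.field("price", t);    % pair a name with a type to build a Field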
+ +function type = int16() +%INT16 Creates an arrow.type.Int16Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.Int16Type"); + type = arrow.type.Int16Type(proxy); +end diff --git a/matlab/src/matlab/+arrow/int32.m b/matlab/src/matlab/+arrow/int32.m new file mode 100644 index 0000000000000..80673a6bb57a7 --- /dev/null +++ b/matlab/src/matlab/+arrow/int32.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = int32() +%INT32 Creates an arrow.type.Int32Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.Int32Type"); + type = arrow.type.Int32Type(proxy); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/int64.m b/matlab/src/matlab/+arrow/int64.m new file mode 100644 index 0000000000000..7e28fdc48e520 --- /dev/null +++ b/matlab/src/matlab/+arrow/int64.m @@ -0,0 +1,21 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = int64() +%INT64 Creates an arrow.type.Int64Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.Int64Type"); + type = arrow.type.Int64Type(proxy); +end + diff --git a/matlab/src/matlab/+arrow/int8.m b/matlab/src/matlab/+arrow/int8.m new file mode 100644 index 0000000000000..d59281cfb3db2 --- /dev/null +++ b/matlab/src/matlab/+arrow/int8.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. 
See the License for the specific language governing +% permissions and limitations under the License. + +function type = int8() +%INT8 Creates an arrow.type.Int8Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.Int8Type"); + type = arrow.type.Int8Type(proxy); +end diff --git a/matlab/src/matlab/+arrow/string.m b/matlab/src/matlab/+arrow/string.m new file mode 100644 index 0000000000000..71329adc7cc2e --- /dev/null +++ b/matlab/src/matlab/+arrow/string.m @@ -0,0 +1,21 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = string() +%STRING Creates an arrow.type.StringType object + proxy = arrow.internal.proxy.create("arrow.type.proxy.StringType"); + type = arrow.type.StringType(proxy); +end + diff --git a/matlab/src/matlab/+arrow/timestamp.m b/matlab/src/matlab/+arrow/timestamp.m new file mode 100644 index 0000000000000..6ad47eae27e45 --- /dev/null +++ b/matlab/src/matlab/+arrow/timestamp.m @@ -0,0 +1,25 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = timestamp(opts) +%TIMESTAMP Creates an arrow.type.TimestampType object + arguments + opts.TimeUnit(1, 1) arrow.type.TimeUnit = arrow.type.TimeUnit.Microsecond + opts.TimeZone(1, 1) string {mustBeNonmissing} = "" + end + args = struct(TimeUnit=string(opts.TimeUnit), TimeZone=opts.TimeZone); + proxy = arrow.internal.proxy.create("arrow.type.proxy.TimestampType", args); + type = arrow.type.TimestampType(proxy); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/uint16.m b/matlab/src/matlab/+arrow/uint16.m new file mode 100644 index 0000000000000..75032a0253cbc --- /dev/null +++ b/matlab/src/matlab/+arrow/uint16.m @@ -0,0 +1,21 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = uint16() +%UINT16 Creates an arrow.type.UInt16Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.UInt16Type"); + type = arrow.type.UInt16Type(proxy); +end + diff --git a/matlab/src/matlab/+arrow/uint32.m b/matlab/src/matlab/+arrow/uint32.m new file mode 100644 index 0000000000000..79b821605d52a --- /dev/null +++ b/matlab/src/matlab/+arrow/uint32.m @@ -0,0 +1,21 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = uint32() +%UINT32 Creates an arrow.type.UInt32Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.UInt32Type"); + type = arrow.type.UInt32Type(proxy); +end + diff --git a/matlab/src/matlab/+arrow/uint64.m b/matlab/src/matlab/+arrow/uint64.m new file mode 100644 index 0000000000000..c0965fc9bd40f --- /dev/null +++ b/matlab/src/matlab/+arrow/uint64.m @@ -0,0 +1,20 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = uint64() +%UINT64 Creates an arrow.type.UInt64Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.UInt64Type"); + type = arrow.type.UInt64Type(proxy); +end diff --git a/matlab/src/matlab/+arrow/uint8.m b/matlab/src/matlab/+arrow/uint8.m new file mode 100644 index 0000000000000..b199a3c766052 --- /dev/null +++ b/matlab/src/matlab/+arrow/uint8.m @@ -0,0 +1,21 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type = uint8() +%UINT8 Creates an arrow.type.UInt8Type object + proxy = arrow.internal.proxy.create("arrow.type.proxy.UInt8Type"); + type = arrow.type.UInt8Type(proxy); +end + diff --git a/matlab/test/arrow/array/hNumericArray.m b/matlab/test/arrow/array/hNumericArray.m index ca5a534524433..f9f5f1d9e4ee3 100644 --- a/matlab/test/arrow/array/hNumericArray.m +++ b/matlab/test/arrow/array/hNumericArray.m @@ -27,10 +27,6 @@ ArrowType(1, 1) end - properties (TestParameter) - MakeDeepCopy = {true false} - end - methods(TestClassSetup) function verifyOnMatlabPath(tc) % Verify the arrow array class is on the MATLAB Search Path. @@ -41,103 +37,82 @@ function verifyOnMatlabPath(tc) end methods(Test) - function BasicTest(tc, MakeDeepCopy) - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3]), DeepCopy=MakeDeepCopy); + function BasicTest(tc) + A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3])); className = string(class(A)); tc.verifyEqual(className, tc.ArrowArrayClassName); end - function ShallowCopyTest(tc) - % By default, NumericArrays do not create a deep copy on - % construction when constructed from a MATLAB array. Instead, - % it stores a shallow copy of the array to keep the memory alive. - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1, 2, 3])); - tc.verifyEqual(A.MatlabArray, tc.MatlabArrayFcn([1, 2, 3])); - tc.verifyEqual(toMATLAB(A), tc.MatlabArrayFcn([1 2 3]')); - - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1, 2, 3]), DeepCopy=false); - tc.verifyEqual(A.MatlabArray, tc.MatlabArrayFcn([1 2 3])); - tc.verifyEqual(toMATLAB(A), tc.MatlabArrayFcn([1 2 3]')); - end - - function DeepCopyTest(tc) - % Verify NumericArrays does not store a shallow copy of the - % MATLAB array if DeepCopy=true was supplied. - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1, 2, 3]), DeepCopy=true); - tc.verifyEqual(A.MatlabArray, tc.MatlabArrayFcn([])); - tc.verifyEqual(toMATLAB(A), tc.MatlabArrayFcn([1 2 3]')); - end - - function ToMATLAB(tc, MakeDeepCopy) + function ToMATLAB(tc) % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100), DeepCopy=MakeDeepCopy); + A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100)); data = toMATLAB(A1); tc.verifyEqual(data, tc.MatlabArrayFcn(100)); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3]), DeepCopy=MakeDeepCopy); + A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3])); data = toMATLAB(A2); tc.verifyEqual(data, tc.MatlabArrayFcn([1 2 3]')); % Create a Float64Array from an empty double vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([]), DeepCopy=MakeDeepCopy); + A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([])); data = toMATLAB(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end - function MatlabConversion(tc, MakeDeepCopy) + function MatlabConversion(tc) % Tests the type-specific conversion methods, e.g.
single for % arrow.array.Float32Array, double for % arrow.array.Float64Array % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100), DeepCopy=MakeDeepCopy); + A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100)); data = tc.MatlabConversionFcn(A1); tc.verifyEqual(data, tc.MatlabArrayFcn(100)); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3]), DeepCopy=MakeDeepCopy); + A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3])); data = tc.MatlabConversionFcn(A2); tc.verifyEqual(data, tc.MatlabArrayFcn([1 2 3]')); % Create an array from an empty vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([]), DeepCopy=MakeDeepCopy); + A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([])); data = tc.MatlabConversionFcn(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end - function MinValueTest(tc, MakeDeepCopy) - A = tc.ArrowArrayConstructor(tc.MinValue, DeepCopy=MakeDeepCopy); + function MinValueTest(tc) + A = tc.ArrowArrayConstructor(tc.MinValue); tc.verifyEqual(toMATLAB(A), tc.MinValue); end - function MaxValueTest(tc, MakeDeepCopy) - A1 = tc.ArrowArrayConstructor(tc.MaxValue, DeepCopy=MakeDeepCopy); + function MaxValueTest(tc) + A1 = tc.ArrowArrayConstructor(tc.MaxValue); tc.verifyEqual(toMATLAB(A1), tc.MaxValue); end - function ErrorIfComplex(tc, MakeDeepCopy) - fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn([10 + 1i, 4]), DeepCopy=MakeDeepCopy); + function ErrorIfComplex(tc) + fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn([10 + 1i, 4])); tc.verifyError(fcn, "MATLAB:expectedReal"); end - function ErrorIfNonVector(tc, MakeDeepCopy) + function ErrorIfNonVector(tc) data = tc.MatlabArrayFcn([1 2 3 4 5 6 7 8 9]); data = reshape(data, 3, 1, 3); - fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn(data), DeepCopy=MakeDeepCopy); + fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn(data)); tc.verifyError(fcn, "MATLAB:expectedVector"); end - function ErrorIfEmptyArrayIsNotTwoDimensional(tc, MakeDeepCopy) + function ErrorIfEmptyArrayIsNotTwoDimensional(tc) data = tc.MatlabArrayFcn(reshape([], [1 0 0])); - fcn = @() tc.ArrowArrayConstructor(data, DeepCopy=MakeDeepCopy); + fcn = @() tc.ArrowArrayConstructor(data); tc.verifyError(fcn, "MATLAB:expected2D"); end - function LogicalValidNVPair(tc, MakeDeepCopy) + function LogicalValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a logical array data = tc.MatlabArrayFcn([1 2 3 4]); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[false true true false], DeepCopy=MakeDeepCopy); + arrowArray = tc.ArrowArrayConstructor(data, Valid=[false true true false]); expectedData = data'; expectedData([1 4]) = tc.NullSubstitutionValue; @@ -146,11 +121,11 @@ function LogicalValidNVPair(tc, MakeDeepCopy) tc.verifyEqual(arrowArray.Valid, [false; true; true; false]); end - function NumericValidNVPair(tc, MakeDeepCopy) + function NumericValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as an array of indices data = tc.MatlabArrayFcn([1 2 3 4]); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[2 4], DeepCopy=MakeDeepCopy); + arrowArray = tc.ArrowArrayConstructor(data, Valid=[2 4]); expectedData = data'; expectedData([1 3]) = tc.NullSubstitutionValue; @@ -171,7 +146,7 @@ function TestArrowType(tc) % Verify the array has the expected arrow.type.Type object data = tc.MatlabArrayFcn([1 2 3 4]); arrowArray = tc.ArrowArrayConstructor(data); - tc.verifyEqual(arrowArray.Type, tc.ArrowType); +
tc.verifyEqual(arrowArray.Type.ID, tc.ArrowType.ID); end end end diff --git a/matlab/test/arrow/array/tBooleanArray.m b/matlab/test/arrow/array/tBooleanArray.m index 3a565202a2775..e27ca11285a50 100644 --- a/matlab/test/arrow/array/tBooleanArray.m +++ b/matlab/test/arrow/array/tBooleanArray.m @@ -22,7 +22,7 @@ MatlabArrayFcn = @logical MatlabConversionFcn = @logical NullSubstitutionValue = false - ArrowType = arrow.type.BooleanType + ArrowType = arrow.boolean end methods(TestClassSetup) @@ -155,7 +155,7 @@ function TestArrowType(tc) % Verify the array has the expected arrow.type.Type object data = tc.MatlabArrayFcn([true false]); arrowArray = tc.ArrowArrayConstructor(data); - tc.verifyEqual(arrowArray.Type, tc.ArrowType); + tc.verifyEqual(arrowArray.Type.ID, tc.ArrowType.ID); end end end diff --git a/matlab/test/arrow/array/tFloat32Array.m b/matlab/test/arrow/array/tFloat32Array.m index 0266fa618137e..f007e2b422d6e 100644 --- a/matlab/test/arrow/array/tFloat32Array.m +++ b/matlab/test/arrow/array/tFloat32Array.m @@ -24,72 +24,72 @@ MaxValue = realmax("single") MinValue = realmin("single") NullSubstitutionValue = single(NaN) - ArrowType = arrow.type.Float32Type + ArrowType = arrow.float32 end methods(Test) - function InfValues(testCase, MakeDeepCopy) - A1 = arrow.array.Float32Array(single([Inf -Inf]), DeepCopy=MakeDeepCopy); + function InfValues(testCase) + A1 = arrow.array.Float32Array(single([Inf -Inf])); data = single(A1); testCase.verifyEqual(data, single([Inf -Inf]')); end - function ValidBasic(testCase, MakeDeepCopy) + function ValidBasic(testCase) % Create a MATLAB array with one null value (i.e. one NaN). % Verify NaN is considered a null value by default. matlabArray = single([1, NaN, 3]'); - arrowArray = arrow.array.Float32Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float32Array(matlabArray); expectedValid = [true, false, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end - function InferNulls(testCase, MakeDeepCopy) + function InferNulls(testCase) matlabArray = single([1, NaN, 3]); % Verify NaN is treated as a null value when InferNulls=true. - arrowArray1 = arrow.array.Float32Array(matlabArray, InferNulls=true, DeepCopy=MakeDeepCopy); + arrowArray1 = arrow.array.Float32Array(matlabArray, InferNulls=true); expectedValid1 = [true false true]'; testCase.verifyEqual(arrowArray1.Valid, expectedValid1); testCase.verifyEqual(toMATLAB(arrowArray1), matlabArray'); % Verify NaN is not treated as a null value when InferNulls=false. - arrowArray2 = arrow.array.Float32Array(matlabArray, InferNulls=false, DeepCopy=MakeDeepCopy); + arrowArray2 = arrow.array.Float32Array(matlabArray, InferNulls=false); expectedValid2 = [true true true]'; testCase.verifyEqual(arrowArray2.Valid, expectedValid2); testCase.verifyEqual(toMATLAB(arrowArray2), matlabArray'); end - function ValidNoNulls(testCase, MakeDeepCopy) + function ValidNoNulls(testCase) % Create a MATLAB array with no null values (i.e. no NaNs). matlabArray = single([1, 2, 3]'); - arrowArray = arrow.array.Float32Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float32Array(matlabArray); expectedValid = [true, true, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end - function ValidAllNulls(testCase, MakeDeepCopy) + function ValidAllNulls(testCase) % Create a MATLAB array with all null values (i.e. all NaNs). 
matlabArray = single([NaN, NaN, NaN]'); - arrowArray = arrow.array.Float32Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float32Array(matlabArray); expectedValid = [false, false, false]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end - function EmptyArrayValidBitmap(testCase, MakeDeepCopy) + function EmptyArrayValidBitmap(testCase) % Create an empty 0x0 MATLAB array. matlabArray = single.empty(0, 0); - arrowArray = arrow.array.Float32Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float32Array(matlabArray); expectedValid = logical.empty(0, 1); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 0x1 MATLAB array. matlabArray = single.empty(0, 1); - arrowArray = arrow.array.Float32Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float32Array(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 1x0 MATLAB array. matlabArray = single.empty(1, 0); - arrowArray = arrow.array.Float32Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float32Array(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); end diff --git a/matlab/test/arrow/array/tFloat64Array.m b/matlab/test/arrow/array/tFloat64Array.m index d956d33c6860b..9b30ec8f25d49 100755 --- a/matlab/test/arrow/array/tFloat64Array.m +++ b/matlab/test/arrow/array/tFloat64Array.m @@ -24,77 +24,77 @@ MaxValue = realmax("double") MinValue = realmin("double") NullSubstitutionValue = NaN - ArrowType = arrow.type.Float64Type + ArrowType = arrow.float64 end methods(Test) - function InfValues(testCase, MakeDeepCopy) - A1 = arrow.array.Float64Array([Inf -Inf], DeepCopy=MakeDeepCopy); + function InfValues(testCase) + A1 = arrow.array.Float64Array([Inf -Inf]); data = double(A1); testCase.verifyEqual(data, [Inf -Inf]'); end - function ErrorIfSparse(testCase, MakeDeepCopy) - fcn = @() arrow.array.Float64Array(sparse(ones([10 1])), DeepCopy=MakeDeepCopy); + function ErrorIfSparse(testCase) + fcn = @() arrow.array.Float64Array(sparse(ones([10 1]))); testCase.verifyError(fcn, "MATLAB:expectedNonsparse"); end - function ValidBasic(testCase, MakeDeepCopy) + function ValidBasic(testCase) % Create a MATLAB array with one null value (i.e. one NaN). % Verify NaN is considered a null value by default. matlabArray = [1, NaN, 3]'; - arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float64Array(matlabArray); expectedValid = [true, false, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end - function InferNulls(testCase, MakeDeepCopy) + function InferNulls(testCase) matlabArray = [1, NaN, 3]; % Verify NaN is treated as a null value when InferNulls=true. - arrowArray1 = arrow.array.Float64Array(matlabArray, InferNulls=true, DeepCopy=MakeDeepCopy); + arrowArray1 = arrow.array.Float64Array(matlabArray, InferNulls=true); expectedValid1 = [true false true]'; testCase.verifyEqual(arrowArray1.Valid, expectedValid1); testCase.verifyEqual(toMATLAB(arrowArray1), matlabArray'); % Verify NaN is not treated as a null value when InferNulls=false. 
- arrowArray2 = arrow.array.Float64Array(matlabArray, InferNulls=false, DeepCopy=MakeDeepCopy); + arrowArray2 = arrow.array.Float64Array(matlabArray, InferNulls=false); expectedValid2 = [true true true]'; testCase.verifyEqual(arrowArray2.Valid, expectedValid2); testCase.verifyEqual(toMATLAB(arrowArray2), matlabArray'); end - function ValidNoNulls(testCase, MakeDeepCopy) + function ValidNoNulls(testCase) % Create a MATLAB array with no null values (i.e. no NaNs). matlabArray = [1, 2, 3]'; - arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float64Array(matlabArray); expectedValid = [true, true, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end - function ValidAllNulls(testCase, MakeDeepCopy) + function ValidAllNulls(testCase) % Create a MATLAB array with all null values (i.e. all NaNs). matlabArray = [NaN, NaN, NaN]'; - arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float64Array(matlabArray); expectedValid = [false, false, false]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end - function EmptyArrayValidBitmap(testCase, MakeDeepCopy) + function EmptyArrayValidBitmap(testCase) % Create an empty 0x0 MATLAB array. matlabArray = double.empty(0, 0); - arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float64Array(matlabArray); expectedValid = logical.empty(0, 1); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 0x1 MATLAB array. matlabArray = double.empty(0, 1); - arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float64Array(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 1x0 MATLAB array. 
matlabArray = double.empty(1, 0); - arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); + arrowArray = arrow.array.Float64Array(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); end diff --git a/matlab/test/arrow/array/tInt16Array.m b/matlab/test/arrow/array/tInt16Array.m index 58193e076c228..9cb5fdc1d1049 100644 --- a/matlab/test/arrow/array/tInt16Array.m +++ b/matlab/test/arrow/array/tInt16Array.m @@ -24,7 +24,7 @@ MaxValue = intmax("int16") MinValue = intmin("int16") NullSubstitutionValue = int16(0) - ArrowType = arrow.type.Int16Type + ArrowType = arrow.int16 end end diff --git a/matlab/test/arrow/array/tInt32Array.m b/matlab/test/arrow/array/tInt32Array.m index 59255c1272638..b45705592d714 100644 --- a/matlab/test/arrow/array/tInt32Array.m +++ b/matlab/test/arrow/array/tInt32Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("int32") MinValue = intmin("int32") NullSubstitutionValue = int32(0) - ArrowType = arrow.type.Int32Type + ArrowType = arrow.int32 end end diff --git a/matlab/test/arrow/array/tInt64Array.m b/matlab/test/arrow/array/tInt64Array.m index 289b4fcf3e290..0b38f58547cce 100644 --- a/matlab/test/arrow/array/tInt64Array.m +++ b/matlab/test/arrow/array/tInt64Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("int64") MinValue = intmin("int64") NullSubstitutionValue = int64(0) - ArrowType = arrow.type.Int64Type + ArrowType = arrow.int64 end end diff --git a/matlab/test/arrow/array/tInt8Array.m b/matlab/test/arrow/array/tInt8Array.m index 9ae1eb8cc4fe7..8ce8e4e9b2d14 100644 --- a/matlab/test/arrow/array/tInt8Array.m +++ b/matlab/test/arrow/array/tInt8Array.m @@ -24,7 +24,7 @@ MaxValue = intmax("int8") MinValue = intmin("int8") NullSubstitutionValue = int8(0) - ArrowType = arrow.type.Int8Type + ArrowType = arrow.int8 end end diff --git a/matlab/test/arrow/array/tStringArray.m b/matlab/test/arrow/array/tStringArray.m index 000a57b27bcc2..dbb2adca0ce5b 100644 --- a/matlab/test/arrow/array/tStringArray.m +++ b/matlab/test/arrow/array/tStringArray.m @@ -22,7 +22,7 @@ MatlabArrayFcn = @string MatlabConversionFcn = @string NullSubstitutionValue = string(missing) - ArrowType = arrow.type.StringType + ArrowType = arrow.string end methods(TestClassSetup) @@ -149,7 +149,7 @@ function TestArrowType(tc) % Verify the array has the expected arrow.type.Type object data = tc.MatlabArrayFcn(["A", "B"]); arrowArray = tc.ArrowArrayConstructor(data); - tc.verifyEqual(arrowArray.Type, tc.ArrowType); + tc.verifyEqual(arrowArray.Type.ID, tc.ArrowType.ID); end function Unicode(tc) diff --git a/matlab/test/arrow/array/tUInt16Array.m b/matlab/test/arrow/array/tUInt16Array.m index b79a753694684..705d6eabc0b7b 100644 --- a/matlab/test/arrow/array/tUInt16Array.m +++ b/matlab/test/arrow/array/tUInt16Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint16") MinValue = intmin("uint16") NullSubstitutionValue = uint16(0) - ArrowType = arrow.type.UInt16Type + ArrowType = arrow.uint16 end end diff --git a/matlab/test/arrow/array/tUInt32Array.m b/matlab/test/arrow/array/tUInt32Array.m index 157cad941724d..267a687738e44 100644 --- a/matlab/test/arrow/array/tUInt32Array.m +++ b/matlab/test/arrow/array/tUInt32Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint32") MinValue = intmin("uint32") NullSubstitutionValue = uint32(0) - ArrowType = arrow.type.UInt32Type + ArrowType = arrow.uint32 end end diff --git a/matlab/test/arrow/array/tUInt64Array.m b/matlab/test/arrow/array/tUInt64Array.m index 41e479e816263..b1a23a004de69 100644 --- a/matlab/test/arrow/array/tUInt64Array.m +++ 
b/matlab/test/arrow/array/tUInt64Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint64") MinValue = intmin("uint64") NullSubstitutionValue = uint64(0) - ArrowType = arrow.type.UInt64Type + ArrowType = arrow.uint64 end end diff --git a/matlab/test/arrow/array/tUInt8Array.m b/matlab/test/arrow/array/tUInt8Array.m index 4aca2cced1c8d..3db79f8c0b16d 100644 --- a/matlab/test/arrow/array/tUInt8Array.m +++ b/matlab/test/arrow/array/tUInt8Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint8") MinValue = intmin("uint8") NullSubstitutionValue = uint8(0) - ArrowType = arrow.type.UInt8Type + ArrowType = arrow.uint8 end end diff --git a/matlab/test/arrow/type/hPrimitiveType.m b/matlab/test/arrow/type/hFixedWidthType.m similarity index 80% rename from matlab/test/arrow/type/hPrimitiveType.m rename to matlab/test/arrow/type/hFixedWidthType.m index b757ad4b409c9..308ac46011a6c 100644 --- a/matlab/test/arrow/type/hPrimitiveType.m +++ b/matlab/test/arrow/type/hFixedWidthType.m @@ -13,17 +13,24 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef hPrimitiveType < matlab.unittest.TestCase +classdef hFixedWidthType < matlab.unittest.TestCase % Test class that defines shared unit tests for classes that inherit from -% arrow.type.PrimitiveType +% arrow.type.FixedWidthType properties(Abstract) ArrowType TypeID BitWidth + ClassName end methods(Test) + function TestClass(testCase) + % Verify ArrowType is an object of the expected class type. + name = string(class(testCase.ArrowType)); + testCase.verifyEqual(name, testCase.ClassName); + end + function TestTypeID(testCase) % Verify ID is set to the appropriate arrow.type.ID value. arrowType = testCase.ArrowType; @@ -39,13 +46,7 @@ function TestBitWidth(testCase) function TestNumFields(testCase) % Verify NumFields is set to 0 for primitive types. arrowType = testCase.ArrowType; - testCase.verifyEqual(arrowType.NumFields, 0); - end - - function TestNumBuffers(testCase) - % Verify NumBuffers is set to 2 for primitive types. - arrowType = testCase.ArrowType; - testCase.verifyEqual(arrowType.NumBuffers, 2); + testCase.verifyEqual(arrowType.NumFields, int32(0)); end end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tBooleanType.m b/matlab/test/arrow/type/tBooleanType.m index 23884991f2065..eaa1c280d5355 100644 --- a/matlab/test/arrow/type/tBooleanType.m +++ b/matlab/test/arrow/type/tBooleanType.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tBooleanType < hPrimitiveType +classdef tBooleanType < hFixedWidthType % Test class for arrow.type.BooleanType properties - ArrowType = arrow.type.BooleanType + ArrowType = arrow.boolean TypeID = arrow.type.ID.Boolean - BitWidth = 1; + BitWidth = int32(1) + ClassName = "arrow.type.BooleanType" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tField.m b/matlab/test/arrow/type/tField.m new file mode 100644 index 0000000000000..77a05bbe39513 --- /dev/null +++ b/matlab/test/arrow/type/tField.m @@ -0,0 +1,131 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tField < matlab.unittest.TestCase +% Test class for arrow.type.Field and arrow.field. + + methods(Test) + function TestBasic(testCase) + name = "A"; + type = arrow.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, name); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestSupportedTypes(testCase) + name = "name"; + supportedTypes = { ... + arrow.uint8, ... + arrow.uint16, ... + arrow.uint32, ... + arrow.uint64, ... + arrow.int8, ... + arrow.int16, ... + arrow.int32, ... + arrow.int64, ... + arrow.boolean, ... + arrow.float32, ... + arrow.float64, ... + arrow.string, ... + arrow.timestamp, ... + }; + for ii = 1:numel(supportedTypes) + supportedType = supportedTypes{ii}; + field = arrow.field(name, supportedType); + testCase.verifyEqual(field.Name, name); + testCase.verifyEqual(field.Type.ID, supportedType.ID); + end + end + + function TestNameUnicode(testCase) + smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + + type = arrow.uint64; + field = arrow.field(smiley, type); + + testCase.verifyEqual(field.Name, smiley); + testCase.verifyEqual(field.Type.ID, type.ID); + + field = arrow.field(tree, type); + + testCase.verifyEqual(field.Name, tree); + testCase.verifyEqual(field.Type.ID, type.ID); + + field = arrow.field(mango, type); + + testCase.verifyEqual(field.Name, mango); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestErrorIfNameStringMissing(testCase) + name = string(missing); + type = arrow.uint64; + testCase.verifyError(@() arrow.field(name, type), "MATLAB:validators:mustBeNonmissing"); + end + + function TestNameEmptyString(testCase) + name = ""; + type = arrow.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, name); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestNameCharVector(testCase) + name = 'ABC'; + type = arrow.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, string(name)); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestNameNumber(testCase) + name = 123; + type = arrow.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, string(123)); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestArrowTypeUnsupportedInput(testCase) + name = "A"; + type = { 123 }; + testCase.verifyError(@() arrow.field(name, type), "MATLAB:validation:UnableToConvert"); + end + + function TestNameUnsupportedInput(testCase) + name = table(); + type = arrow.uint64; + testCase.verifyError(@() arrow.field(name, type), "MATLAB:validation:UnableToConvert"); + end + + function TestImmutableProperties(testCase) + name = "A"; + type = arrow.uint64; + field = arrow.field(name, type); + + testCase.verifyError(@() setfield(field, "Name", "NewValue"), "MATLAB:class:noSetMethod") + testCase.verifyError(@() setfield(field, "Type", arrow.boolean), "MATLAB:class:noSetMethod") + end + + end +end diff --git a/matlab/test/arrow/type/tFloat32Type.m b/matlab/test/arrow/type/tFloat32Type.m index 8c4fa5f402942..1837c39a72ed8 100644 --- a/matlab/test/arrow/type/tFloat32Type.m +++ 
b/matlab/test/arrow/type/tFloat32Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tFloat32Type < hPrimitiveType +classdef tFloat32Type < hFixedWidthType % Test class for arrow.type.Float32Type properties - ArrowType = arrow.type.Float32Type + ArrowType = arrow.float32 TypeID = arrow.type.ID.Float32 - BitWidth = 32; + BitWidth = int32(32) + ClassName = "arrow.type.Float32Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tFloat64Type.m b/matlab/test/arrow/type/tFloat64Type.m index c4489c4080341..8387a4bf5807b 100644 --- a/matlab/test/arrow/type/tFloat64Type.m +++ b/matlab/test/arrow/type/tFloat64Type.m @@ -13,12 +13,14 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tFloat64Type < hPrimitiveType +classdef tFloat64Type < hFixedWidthType % Test class for arrow.type.Float64Type properties - ArrowType = arrow.type.Float64Type + ArrowType = arrow.float64 TypeID = arrow.type.ID.Float64 - BitWidth = 64; + BitWidth = int32(64) + ClassName = "arrow.type.Float64Type" + end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tID.m b/matlab/test/arrow/type/tID.m index 10c99dfab8775..344d2dd0f5a96 100644 --- a/matlab/test/arrow/type/tID.m +++ b/matlab/test/arrow/type/tID.m @@ -26,22 +26,6 @@ function verifyOnMatlabPath(tc) end methods (Test) - function bitWidth(testCase) - import arrow.type.ID - - typeIDs = [ID.Boolean, ID.UInt8, ID.Int8, ID.UInt16, ... - ID.Int16, ID.UInt32, ID.Int32, ID.UInt64, ... - ID.Int64, ID.Float32, ID.Float64]; - - expectedWidths = [1, 8, 8, 16, 16, 32, 32, 64, 64, 32, 64]; - - for ii = 1:numel(typeIDs) - actualWidth = bitWidth(typeIDs(ii)); - expectedWidth = expectedWidths(ii); - testCase.verifyEqual(actualWidth, expectedWidth); - end - end - function CastToUInt64(testCase) import arrow.type.ID diff --git a/matlab/test/arrow/type/tInt16Type.m b/matlab/test/arrow/type/tInt16Type.m index b5b5e803dfd06..9b741a32956f5 100644 --- a/matlab/test/arrow/type/tInt16Type.m +++ b/matlab/test/arrow/type/tInt16Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tInt16Type < hPrimitiveType +classdef tInt16Type < hFixedWidthType % Test class for arrow.type.Int16Type properties - ArrowType = arrow.type.Int16Type + ArrowType = arrow.int16 TypeID = arrow.type.ID.Int16 - BitWidth = 16; + BitWidth = int32(16) + ClassName = "arrow.type.Int16Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tInt32Type.m b/matlab/test/arrow/type/tInt32Type.m index ab9c1bf4a7afa..9724f9a4a6a96 100644 --- a/matlab/test/arrow/type/tInt32Type.m +++ b/matlab/test/arrow/type/tInt32Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tInt32Type < hPrimitiveType +classdef tInt32Type < hFixedWidthType % Test class for arrow.type.Int32Type properties - ArrowType = arrow.type.Int32Type + ArrowType = arrow.int32 TypeID = arrow.type.ID.Int32 - BitWidth = 32; + BitWidth = int32(32) + ClassName = "arrow.type.Int32Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tInt64Type.m b/matlab/test/arrow/type/tInt64Type.m index b5a273f0f36a0..2acb5fd2d3f1f 100644 --- a/matlab/test/arrow/type/tInt64Type.m +++ b/matlab/test/arrow/type/tInt64Type.m @@ -13,12 +13,13 @@ % implied. 
See the License for the specific language governing % permissions and limitations under the License. -classdef tInt64Type < hPrimitiveType +classdef tInt64Type < hFixedWidthType % Test class for arrow.type.Int64Type properties - ArrowType = arrow.type.Int64Type + ArrowType = arrow.int64 TypeID = arrow.type.ID.Int64 - BitWidth = 64; + BitWidth = int32(64) + ClassName = "arrow.type.Int64Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tInt8Type.m b/matlab/test/arrow/type/tInt8Type.m index 7e8e06790d460..15e2629bc4d68 100644 --- a/matlab/test/arrow/type/tInt8Type.m +++ b/matlab/test/arrow/type/tInt8Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tInt8Type < hPrimitiveType +classdef tInt8Type < hFixedWidthType % Test class for arrow.type.Int8Type properties - ArrowType = arrow.type.Int8Type + ArrowType = arrow.int8 TypeID = arrow.type.ID.Int8 - BitWidth = 8; + BitWidth = int32(8) + ClassName = "arrow.type.Int8Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tStringType.m b/matlab/test/arrow/type/tStringType.m index f3cf101ac6185..e52c2cb1cba0b 100644 --- a/matlab/test/arrow/type/tStringType.m +++ b/matlab/test/arrow/type/tStringType.m @@ -19,20 +19,15 @@ methods (Test) function Basic(tc) - type = arrow.type.StringType; + type = arrow.string; className = string(class(type)); tc.verifyEqual(className, "arrow.type.StringType"); tc.verifyEqual(type.ID, arrow.type.ID.String); end - function NumBuffers(tc) - type = arrow.type.StringType; - tc.verifyEqual(type.NumBuffers, 3); - end - function NumFields(tc) - type = arrow.type.StringType; - tc.verifyEqual(type.NumFields, 0); + type = arrow.string; + tc.verifyEqual(type.NumFields, int32(0)); end end diff --git a/matlab/test/arrow/type/tTimeUnit.m b/matlab/test/arrow/type/tTimeUnit.m index b01de443443c5..0c2432193a3af 100644 --- a/matlab/test/arrow/type/tTimeUnit.m +++ b/matlab/test/arrow/type/tTimeUnit.m @@ -31,9 +31,9 @@ function TicksPerSecond(testCase) import arrow.type.TimeUnit units = [TimeUnit.Second, TimeUnit.Millisecond, ... TimeUnit.Microsecond, TimeUnit.Nanosecond]'; - ticksPerSecond = [1 1e3 1e6 1e9]; + ticks = [1 1e3 1e6 1e9]; for ii = 1:numel(units) - testCase.verifyEqual(units(ii).TicksPerSecond, ticksPerSecond(ii)); + testCase.verifyEqual(ticksPerSecond(units(ii)), ticks(ii)); end end end diff --git a/matlab/test/arrow/type/tTimestampType.m b/matlab/test/arrow/type/tTimestampType.m index f8a9a37f32a63..deee984e4b911 100644 --- a/matlab/test/arrow/type/tTimestampType.m +++ b/matlab/test/arrow/type/tTimestampType.m @@ -13,19 +13,26 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tTimestampType < hPrimitiveType +classdef tTimestampType < hFixedWidthType % Test class for arrow.type.TimestampType properties - ArrowType = arrow.type.TimestampType + ArrowType = arrow.timestamp TypeID = arrow.type.ID.Timestamp - BitWidth = 64; + BitWidth = int32(64) + ClassName = "arrow.type.TimestampType" end methods(Test) + function TestClass(testCase) + % Verify ArrowType is an object of the expected class type. 
+ name = string(class(testCase.ArrowType)); + testCase.verifyEqual(name, testCase.ClassName); + end + function DefaultTimeUnit(testCase) % Verify the default TimeUnit is Microsecond - type = arrow.type.TimestampType; + type = arrow.timestamp; actualUnit = type.TimeUnit; expectedUnit = arrow.type.TimeUnit.Microsecond; testCase.verifyEqual(actualUnit, expectedUnit); @@ -33,7 +40,7 @@ function DefaultTimeUnit(testCase) function DefaultTimeZone(testCase) % Verify the default TimeZone is "" - type = arrow.type.TimestampType; + type = arrow.timestamp; actualTimezone = type.TimeZone; expectedTimezone = ""; testCase.verifyEqual(actualTimezone, expectedTimezone); @@ -46,7 +53,7 @@ function SupplyTimeUnitEnum(testCase) TimeUnit.Microsecond, TimeUnit.Nanosecond]; for unit = expectedUnit - type = TimestampType(TimeUnit=unit); + type = arrow.timestamp(TimeUnit=unit); testCase.verifyEqual(type.TimeUnit, unit); end end @@ -60,43 +67,66 @@ function SupplyTimeUnitString(testCase) TimeUnit.Microsecond, TimeUnit.Nanosecond]; for ii = 1:numel(unitString) - type = TimestampType(TimeUnit=unitString(ii)); + type = arrow.timestamp(TimeUnit=unitString(ii)); testCase.verifyEqual(type.TimeUnit, expectedUnit(ii)); end end function SupplyTimeZone(testCase) % Supply the TimeZone. - type = arrow.type.TimestampType(TimeZone="America/New_York"); + type = arrow.timestamp(TimeZone="America/New_York"); testCase.verifyEqual(type.TimeZone, "America/New_York"); end function ErrorIfMissingStringTimeZone(testCase) - fcn = @() arrow.type.TimestampType(TimeZone=string(missing)); + fcn = @() arrow.timestamp(TimeZone=string(missing)); testCase.verifyError(fcn, "MATLAB:validators:mustBeNonmissing"); end function ErrorIfTimeZoneIsNonScalar(testCase) - fcn = @() arrow.type.TimestampType(TimeZone=["a", "b"]); + fcn = @() arrow.timestamp(TimeZone=["a", "b"]); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); - fcn = @() arrow.type.TimestampType(TimeZone=strings(0, 0)); + fcn = @() arrow.timestamp(TimeZone=strings(0, 0)); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end function ErrorIfAmbiguousTimeUnit(testCase) - fcn = @() arrow.type.TimestampType(TimeUnit="mi"); + fcn = @() arrow.timestamp(TimeUnit="mi"); testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); end function ErrorIfTimeUnitIsNonScalar(testCase) units = [arrow.type.TimeUnit.Second; arrow.type.TimeUnit.Millisecond]; - fcn = @() arrow.type.TimestampType(TimeZone=units); + fcn = @() arrow.timestamp(TimeZone=units); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); units = ["second" "millisecond"]; - fcn = @() arrow.type.TimestampType(TimeZone=units); + fcn = @() arrow.timestamp(TimeZone=units); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end + + function Display(testCase) + % Verify the display of TimestampType objects. 
+ % + % Example: + % + % TimestampType with properties: + % + % ID: Timestamp + % TimeUnit: Second + % TimeZone: "America/Anchorage" + % + type = arrow.timestamp(TimeUnit="Second", TimeZone="America/Anchorage"); %#ok<NASGU> + classnameLink = "<a href=""matlab:helpPopup arrow.type.TimestampType"" style=""font-weight:bold"">TimestampType</a>"; + header = " " + classnameLink + " with properties:" + newline; + body = strjust(pad(["ID:"; "TimeUnit:"; "TimeZone:"])); + body = body + " " + ["Timestamp"; "Second"; """America/Anchorage"""]; + body = " " + body; + footer = string(newline); + expectedDisplay = char(strjoin([header body' footer], newline)); + actualDisplay = evalc('disp(type)'); + testCase.verifyEqual(actualDisplay, expectedDisplay); + end end -end \ No newline at end of file +end diff --git a/matlab/test/arrow/type/tUInt16Type.m b/matlab/test/arrow/type/tUInt16Type.m index b5102ace34d84..8a803dc0a7888 100644 --- a/matlab/test/arrow/type/tUInt16Type.m +++ b/matlab/test/arrow/type/tUInt16Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tUInt16Type < hPrimitiveType +classdef tUInt16Type < hFixedWidthType % Test class for arrow.type.UInt16Type properties - ArrowType = arrow.type.UInt16Type + ArrowType = arrow.uint16 TypeID = arrow.type.ID.UInt16 - BitWidth = 16; + BitWidth = int32(16) + ClassName = "arrow.type.UInt16Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tUInt32Type.m b/matlab/test/arrow/type/tUInt32Type.m index 8f86eec7f53c3..019b8ce26929d 100644 --- a/matlab/test/arrow/type/tUInt32Type.m +++ b/matlab/test/arrow/type/tUInt32Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tUInt32Type < hPrimitiveType +classdef tUInt32Type < hFixedWidthType % Test class for arrow.type.UInt32Type properties - ArrowType = arrow.type.UInt32Type + ArrowType = arrow.uint32 TypeID = arrow.type.ID.UInt32 - BitWidth = 32; + BitWidth = int32(32) + ClassName = "arrow.type.UInt32Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tUInt64Type.m b/matlab/test/arrow/type/tUInt64Type.m index 7f3084616d35f..8287bb40d0052 100644 --- a/matlab/test/arrow/type/tUInt64Type.m +++ b/matlab/test/arrow/type/tUInt64Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tUInt64Type < hPrimitiveType +classdef tUInt64Type < hFixedWidthType % Test class for arrow.type.UInt64Type properties - ArrowType = arrow.type.UInt64Type + ArrowType = arrow.uint64 TypeID = arrow.type.ID.UInt64 - BitWidth = 64; + BitWidth = int32(64) + ClassName = "arrow.type.UInt64Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tUInt8Type.m b/matlab/test/arrow/type/tUInt8Type.m index 6dfc8a4694359..1ff203c862aeb 100644 --- a/matlab/test/arrow/type/tUInt8Type.m +++ b/matlab/test/arrow/type/tUInt8Type.m @@ -13,12 +13,13 @@ % implied. See the License for the specific language governing % permissions and limitations under the License.
-classdef tUInt8Type < hPrimitiveType +classdef tUInt8Type < hFixedWidthType % Test class for arrow.type.UInt8Type properties - ArrowType = arrow.type.UInt8Type + ArrowType = arrow.uint8 TypeID = arrow.type.ID.UInt8 - BitWidth = 8; + BitWidth = int32(8) + ClassName = "arrow.type.UInt8Type" end end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/hTypeTraits.m b/matlab/test/arrow/type/traits/hTypeTraits.m new file mode 100644 index 0000000000000..df62fdd325f2f --- /dev/null +++ b/matlab/test/arrow/type/traits/hTypeTraits.m @@ -0,0 +1,78 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef hTypeTraits < matlab.unittest.TestCase +% Superclass for tests that validate the behavior of "type trait" objects +% like arrow.type.traits.StringTraits. + + properties (Abstract) + TraitsConstructor + ArrayConstructor + ArrayClassName + ArrayProxyClassName + TypeConstructor + TypeClassName + TypeProxyClassName + MatlabConstructor + MatlabClassName + end + + properties + Traits + end + + methods (TestMethodSetup) + function setupTraits(testCase) + testCase.Traits = testCase.TraitsConstructor(); + end + end + + methods(Test) + + function TestArrayConstructor(testCase) + testCase.verifyEqual(testCase.Traits.ArrayConstructor, testCase.ArrayConstructor); + end + + function TestArrayClassName(testCase) + testCase.verifyEqual(testCase.Traits.ArrayClassName, testCase.ArrayClassName); + end + + function TestArrayProxyClassName(testCase) + testCase.verifyEqual(testCase.Traits.ArrayProxyClassName, testCase.ArrayProxyClassName); + end + + function TestTypeConstructor(testCase) + testCase.verifyEqual(testCase.Traits.TypeConstructor, testCase.TypeConstructor); + end + + function TestTypeClassName(testCase) + testCase.verifyEqual(testCase.Traits.TypeClassName, testCase.TypeClassName); + end + + function TestTypeProxyClassName(testCase) + testCase.verifyEqual(testCase.Traits.TypeProxyClassName, testCase.TypeProxyClassName); + end + + function TestMatlabConstructor(testCase) + testCase.verifyEqual(testCase.Traits.MatlabConstructor, testCase.MatlabConstructor); + end + + function TestMatlabClassName(testCase) + testCase.verifyEqual(testCase.Traits.MatlabClassName, testCase.MatlabClassName); + end + + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tBooleanTraits.m b/matlab/test/arrow/type/traits/tBooleanTraits.m new file mode 100644 index 0000000000000..859dc630a1fc7 --- /dev/null +++ b/matlab/test/arrow/type/traits/tBooleanTraits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tBooleanTraits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.BooleanTraits + ArrayConstructor = @arrow.array.BooleanArray + ArrayClassName = "arrow.array.BooleanArray" + ArrayProxyClassName = "arrow.array.proxy.BooleanArray" + TypeConstructor = @arrow.type.BooleanType + TypeClassName = "arrow.type.BooleanType" + TypeProxyClassName = "arrow.type.proxy.BooleanType" + MatlabConstructor = @logical + MatlabClassName = "logical" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tInt16Traits.m b/matlab/test/arrow/type/traits/tInt16Traits.m new file mode 100644 index 0000000000000..bde308d28e68a --- /dev/null +++ b/matlab/test/arrow/type/traits/tInt16Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tInt16Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.Int16Traits + ArrayConstructor = @arrow.array.Int16Array + ArrayClassName = "arrow.array.Int16Array" + ArrayProxyClassName = "arrow.array.proxy.Int16Array" + TypeConstructor = @arrow.type.Int16Type + TypeClassName = "arrow.type.Int16Type" + TypeProxyClassName = "arrow.type.proxy.Int16Type" + MatlabConstructor = @int16 + MatlabClassName = "int16" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tInt32Traits.m b/matlab/test/arrow/type/traits/tInt32Traits.m new file mode 100644 index 0000000000000..651f647455408 --- /dev/null +++ b/matlab/test/arrow/type/traits/tInt32Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. 
See the License for the specific language governing +% permissions and limitations under the License. + +classdef tInt32Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.Int32Traits + ArrayConstructor = @arrow.array.Int32Array + ArrayClassName = "arrow.array.Int32Array" + ArrayProxyClassName = "arrow.array.proxy.Int32Array" + TypeConstructor = @arrow.type.Int32Type + TypeClassName = "arrow.type.Int32Type" + TypeProxyClassName = "arrow.type.proxy.Int32Type" + MatlabConstructor = @int32 + MatlabClassName = "int32" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tInt64Traits.m b/matlab/test/arrow/type/traits/tInt64Traits.m new file mode 100644 index 0000000000000..4f16c91eb4e09 --- /dev/null +++ b/matlab/test/arrow/type/traits/tInt64Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tInt64Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.Int64Traits + ArrayConstructor = @arrow.array.Int64Array + ArrayClassName = "arrow.array.Int64Array" + ArrayProxyClassName = "arrow.array.proxy.Int64Array" + TypeConstructor = @arrow.type.Int64Type + TypeClassName = "arrow.type.Int64Type" + TypeProxyClassName = "arrow.type.proxy.Int64Type" + MatlabConstructor = @int64 + MatlabClassName = "int64" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tInt8Traits.m b/matlab/test/arrow/type/traits/tInt8Traits.m new file mode 100644 index 0000000000000..3e767abbebba4 --- /dev/null +++ b/matlab/test/arrow/type/traits/tInt8Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef tInt8Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.Int8Traits + ArrayConstructor = @arrow.array.Int8Array + ArrayClassName = "arrow.array.Int8Array" + ArrayProxyClassName = "arrow.array.proxy.Int8Array" + TypeConstructor = @arrow.type.Int8Type + TypeClassName = "arrow.type.Int8Type" + TypeProxyClassName = "arrow.type.proxy.Int8Type" + MatlabConstructor = @int8 + MatlabClassName = "int8" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tStringTraits.m b/matlab/test/arrow/type/traits/tStringTraits.m new file mode 100644 index 0000000000000..68f061d1b031d --- /dev/null +++ b/matlab/test/arrow/type/traits/tStringTraits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tStringTraits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.StringTraits + ArrayConstructor = @arrow.array.StringArray + ArrayClassName = "arrow.array.StringArray" + ArrayProxyClassName = "arrow.array.proxy.StringArray" + TypeConstructor = @arrow.type.StringType + TypeClassName = "arrow.type.StringType" + TypeProxyClassName = "arrow.type.proxy.StringType" + MatlabConstructor = @string + MatlabClassName = "string" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tTimestampTraits.m b/matlab/test/arrow/type/traits/tTimestampTraits.m new file mode 100644 index 0000000000000..5f451c0631465 --- /dev/null +++ b/matlab/test/arrow/type/traits/tTimestampTraits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef tTimestampTraits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.TimestampTraits + ArrayConstructor = @arrow.array.TimestampArray + ArrayClassName = "arrow.array.TimestampArray" + ArrayProxyClassName = "arrow.array.proxy.TimestampArray" + TypeConstructor = @arrow.type.TimestampType + TypeClassName = "arrow.type.TimestampType" + TypeProxyClassName = "arrow.type.proxy.TimestampType" + MatlabConstructor = @datetime + MatlabClassName = "datetime" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tUInt16Traits.m b/matlab/test/arrow/type/traits/tUInt16Traits.m new file mode 100644 index 0000000000000..4a9eef6f2978d --- /dev/null +++ b/matlab/test/arrow/type/traits/tUInt16Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tUInt16Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.UInt16Traits + ArrayConstructor = @arrow.array.UInt16Array + ArrayClassName = "arrow.array.UInt16Array" + ArrayProxyClassName = "arrow.array.proxy.UInt16Array" + TypeConstructor = @arrow.type.UInt16Type + TypeClassName = "arrow.type.UInt16Type" + TypeProxyClassName = "arrow.type.proxy.UInt16Type" + MatlabConstructor = @uint16 + MatlabClassName = "uint16" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tUInt32Traits.m b/matlab/test/arrow/type/traits/tUInt32Traits.m new file mode 100644 index 0000000000000..227e42c4eb0ec --- /dev/null +++ b/matlab/test/arrow/type/traits/tUInt32Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef tUInt32Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.UInt32Traits + ArrayConstructor = @arrow.array.UInt32Array + ArrayClassName = "arrow.array.UInt32Array" + ArrayProxyClassName = "arrow.array.proxy.UInt32Array" + TypeConstructor = @arrow.type.UInt32Type + TypeClassName = "arrow.type.UInt32Type" + TypeProxyClassName = "arrow.type.proxy.UInt32Type" + MatlabConstructor = @uint32 + MatlabClassName = "uint32" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tUInt64Traits.m b/matlab/test/arrow/type/traits/tUInt64Traits.m new file mode 100644 index 0000000000000..370e905f27736 --- /dev/null +++ b/matlab/test/arrow/type/traits/tUInt64Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tUInt64Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.UInt64Traits + ArrayConstructor = @arrow.array.UInt64Array + ArrayClassName = "arrow.array.UInt64Array" + ArrayProxyClassName = "arrow.array.proxy.UInt64Array" + TypeConstructor = @arrow.type.UInt64Type + TypeClassName = "arrow.type.UInt64Type" + TypeProxyClassName = "arrow.type.proxy.UInt64Type" + MatlabConstructor = @uint64 + MatlabClassName = "uint64" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/tUInt8Traits.m b/matlab/test/arrow/type/traits/tUInt8Traits.m new file mode 100644 index 0000000000000..d93f9d3c1b942 --- /dev/null +++ b/matlab/test/arrow/type/traits/tUInt8Traits.m @@ -0,0 +1,30 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+ +classdef tUInt8Traits < hTypeTraits + + properties + TraitsConstructor = @arrow.type.traits.UInt8Traits + ArrayConstructor = @arrow.array.UInt8Array + ArrayClassName = "arrow.array.UInt8Array" + ArrayProxyClassName = "arrow.array.proxy.UInt8Array" + TypeConstructor = @arrow.type.UInt8Type + TypeClassName = "arrow.type.UInt8Type" + TypeProxyClassName = "arrow.type.proxy.UInt8Type" + MatlabConstructor = @uint8 + MatlabClassName = "uint8" + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/type/traits/ttraits.m b/matlab/test/arrow/type/traits/ttraits.m new file mode 100644 index 0000000000000..14149a5ebff48 --- /dev/null +++ b/matlab/test/arrow/type/traits/ttraits.m @@ -0,0 +1,320 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef ttraits < matlab.unittest.TestCase + % Tests for the type traits (i.e. arrow.type.traits.traits) + % "gateway" function. + + methods(Test) + + function TestUInt8(testCase) + import arrow.type.traits.* + import arrow.type.* + + typeID = ID.UInt8; + expectedTraits = UInt8Traits(); + + actualTraits = traits(typeID); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestUInt16(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.UInt16; + expectedTraits = UInt16Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestUInt32(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.UInt32; + expectedTraits = UInt32Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestUInt64(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.UInt64; + expectedTraits = UInt64Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestInt8(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.Int8; + expectedTraits = Int8Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestInt16(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.Int16; + expectedTraits = Int16Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestInt32(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.Int32; + expectedTraits = Int32Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestInt64(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.Int64; + expectedTraits = Int64Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function 
TestString(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.String; + expectedTraits = StringTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestTimestamp(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.Timestamp; + expectedTraits = TimestampTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestBoolean(testCase) + import arrow.type.traits.* + import arrow.type.* + + type = ID.Boolean; + expectedTraits = BooleanTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabUInt8(testCase) + import arrow.type.traits.* + + type = "uint8"; + expectedTraits = UInt8Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabUInt16(testCase) + import arrow.type.traits.* + + type = "uint16"; + expectedTraits = UInt16Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabUInt32(testCase) + import arrow.type.traits.* + + type = "uint32"; + expectedTraits = UInt32Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabUInt64(testCase) + import arrow.type.traits.* + + type = "uint64"; + expectedTraits = UInt64Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabInt8(testCase) + import arrow.type.traits.* + + type = "int8"; + expectedTraits = Int8Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabInt16(testCase) + import arrow.type.traits.* + + type = "int16"; + expectedTraits = Int16Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabInt32(testCase) + import arrow.type.traits.* + + type = "int32"; + expectedTraits = Int32Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabInt64(testCase) + import arrow.type.traits.* + + type = "int64"; + expectedTraits = Int64Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabSingle(testCase) + import arrow.type.traits.* + + type = "single"; + expectedTraits = Float32Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabDouble(testCase) + import arrow.type.traits.* + + type = "double"; + expectedTraits = Float64Traits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabLogical(testCase) + import arrow.type.traits.* + + type = "logical"; + expectedTraits = BooleanTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabString(testCase) + import arrow.type.traits.* + + type = "string"; + expectedTraits = StringTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, expectedTraits); + end + + function TestMatlabDatetime(testCase) + import arrow.type.traits.* + + type = "datetime"; + expectedTraits = TimestampTraits(); + + actualTraits = traits(type); + + testCase.verifyEqual(actualTraits, 
expectedTraits); + end + + function TestErrorIfUnsupportedMatlabClass(testCase) + import arrow.type.traits.* + + type = "not-a-class"; + + testCase.verifyError(@() traits(type), "arrow:type:traits:UnsupportedMatlabClass"); + end + + function TestErrorIfUnsupportedInputType(testCase) + import arrow.type.traits.* + + type = 123; + testCase.verifyError(@() traits(type), "arrow:type:traits:UnsupportedInputType"); + + type = {'double'}; + testCase.verifyError(@() traits(type), "arrow:type:traits:UnsupportedInputType"); + + type = datetime(2023, 1, 1); + testCase.verifyError(@() traits(type), "arrow:type:traits:UnsupportedInputType"); + end + + end + +end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 27a64a19a91ef..530799c15c172 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -24,7 +24,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_NAME libmexclass) # libmexclass is accessible for CI without permission issues. set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_REPOSITORY "https://github.com/mathworks/libmexclass.git") # Use a specific Git commit hash to avoid libmexclass version changing unexpectedly. -set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "77f3d72") +set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "d04f88d") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_SOURCE_SUBDIR "libmexclass/cpp") @@ -37,17 +37,24 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_ROOT_INCLUDE_DIR "${CMAKE_SOUR set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/error" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type") - + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/boolean_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/string_array.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/pack.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/unpack.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/time_unit.cc") + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/time_unit.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/type.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/fixed_width_type.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc") + set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy/factory.cc") diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ab63a7a19f7f6..ac7efeff41aba 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1201,6 +1201,8 @@ class 
SliceOptions(_SliceOptions): def __init__(self, start, stop=None, step=1): if stop is None: stop = sys.maxsize + if step < 0: + stop = -stop self._set_options(start, stop, step) diff --git a/python/pyarrow/_dataset.pxd b/python/pyarrow/_dataset.pxd index d626b42e23843..210e5558009ec 100644 --- a/python/pyarrow/_dataset.pxd +++ b/python/pyarrow/_dataset.pxd @@ -160,11 +160,14 @@ cdef class PartitioningFactory(_Weakrefable): cdef: shared_ptr[CPartitioningFactory] wrapped CPartitioningFactory* factory + object constructor + object options cdef init(self, const shared_ptr[CPartitioningFactory]& sp) @staticmethod - cdef wrap(const shared_ptr[CPartitioningFactory]& sp) + cdef wrap(const shared_ptr[CPartitioningFactory]& sp, + object constructor, object options) cdef inline shared_ptr[CPartitioningFactory] unwrap(self) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 2ab8ffb7985ed..badf6e4a4c5dc 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2348,10 +2348,9 @@ cdef class Partitioning(_Weakrefable): return self.wrapped def __eq__(self, other): - try: + if isinstance(other, Partitioning): return self.partitioning.Equals(deref((other).unwrap())) - except TypeError: - return False + return False def parse(self, path): cdef CResult[CExpression] result @@ -2374,16 +2373,22 @@ cdef class PartitioningFactory(_Weakrefable): self.factory = sp.get() @staticmethod - cdef wrap(const shared_ptr[CPartitioningFactory]& sp): + cdef wrap(const shared_ptr[CPartitioningFactory]& sp, + object constructor, object options): cdef PartitioningFactory self = PartitioningFactory.__new__( PartitioningFactory ) self.init(sp) + self.constructor = constructor + self.options = options return self cdef inline shared_ptr[CPartitioningFactory] unwrap(self): return self.wrapped + def __reduce__(self): + return self.constructor, self.options + @property def type_name(self): return frombytes(self.factory.type_name()) @@ -2454,6 +2459,10 @@ cdef class KeyValuePartitioning(Partitioning): return res +def _constructor_directory_partitioning_factory(*args): + return DirectoryPartitioning.discover(*args) + + cdef class DirectoryPartitioning(KeyValuePartitioning): """ A Partitioning based on a specified Schema. 
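The __reduce__ hook added above follows the standard pickle protocol: return a module-level callable plus the argument tuple that recreates the object on load, instead of trying to serialize the wrapped C++ factory. A minimal self-contained sketch of the same pattern, with hypothetical DemoFactory and _make_factory names that are not part of this patch:

    import pickle

    def _make_factory(*args):
        # Module-level so pickle can import it by qualified name on load.
        return DemoFactory(_make_factory, args)

    class DemoFactory:
        def __init__(self, constructor, options):
            self.constructor = constructor  # callable that rebuilds the object
            self.options = options          # argument tuple for that callable

        def __reduce__(self):
            # Unpickling calls constructor(*options) instead of restoring state,
            # sidestepping the unpicklable wrapped object entirely.
            return self.constructor, self.options

    factory = DemoFactory(_make_factory, ("group", "key"))
    restored = pickle.loads(pickle.dumps(factory))
    assert restored.options == ("group", "key")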
@@ -2571,7 +2580,15 @@ cdef class DirectoryPartitioning(KeyValuePartitioning): c_options.segment_encoding = _get_segment_encoding(segment_encoding) return PartitioningFactory.wrap( - CDirectoryPartitioning.MakeFactory(c_field_names, c_options)) + CDirectoryPartitioning.MakeFactory(c_field_names, c_options), + _constructor_directory_partitioning_factory, + (field_names, infer_dictionary, max_partition_dictionary_size, + schema, segment_encoding) + ) + + +def _constructor_hive_partitioning_factory(*args): + return HivePartitioning.discover(*args) cdef class HivePartitioning(KeyValuePartitioning): @@ -2714,7 +2731,15 @@ cdef class HivePartitioning(KeyValuePartitioning): c_options.segment_encoding = _get_segment_encoding(segment_encoding) return PartitioningFactory.wrap( - CHivePartitioning.MakeFactory(c_options)) + CHivePartitioning.MakeFactory(c_options), + _constructor_hive_partitioning_factory, + (infer_dictionary, max_partition_dictionary_size, null_fallback, + schema, segment_encoding), + ) + + +def _constructor_filename_partitioning_factory(*args): + return FilenamePartitioning.discover(*args) cdef class FilenamePartitioning(KeyValuePartitioning): @@ -2823,7 +2848,10 @@ cdef class FilenamePartitioning(KeyValuePartitioning): c_options.segment_encoding = _get_segment_encoding(segment_encoding) return PartitioningFactory.wrap( - CFilenamePartitioning.MakeFactory(c_field_names, c_options)) + CFilenamePartitioning.MakeFactory(c_field_names, c_options), + _constructor_filename_partitioning_factory, + (field_names, infer_dictionary, schema, segment_encoding) + ) cdef class DatasetFactory(_Weakrefable): @@ -2988,7 +3016,7 @@ cdef class FileSystemFactoryOptions(_Weakrefable): c_factory = self.options.partitioning.factory() if c_factory.get() == nullptr: return None - return PartitioningFactory.wrap(c_factory) + return PartitioningFactory.wrap(c_factory, None, None) @partitioning_factory.setter def partitioning_factory(self, PartitioningFactory value): @@ -3309,7 +3337,7 @@ cdef class Scanner(_Weakrefable): ---------- dataset : Dataset Dataset to scan. - columns : list of str, default None + columns : list[str] or dict[str, Expression], default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary with {new_column_name: expression} values for more advanced @@ -3388,7 +3416,7 @@ cdef class Scanner(_Weakrefable): fragment to scan. schema : Schema, optional The schema of the fragment. - columns : list of str, default None + columns : list[str] or dict[str, Expression], default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary with {new_column_name: expression} values for more advanced @@ -3474,7 +3502,7 @@ cdef class Scanner(_Weakrefable): The iterator of Batches. schema : Schema The schema of the batches. - columns : list of str, default None + columns : list[str] or dict[str, Expression], default None The columns to project. 
This can be a list of column names to include (order and duplicates will be preserved), or a dictionary with {new_column_name: expression} values for more advanced diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index bc4786b9cd61e..4ad0caec30798 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -811,7 +811,7 @@ cdef class ParquetFactoryOptions(_Weakrefable): c_factory = self.options.partitioning.factory() if c_factory.get() == nullptr: return None - return PartitioningFactory.wrap(c_factory) + return PartitioningFactory.wrap(c_factory, None, None) @partitioning_factory.setter def partitioning_factory(self, PartitioningFactory value): diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index 6f5cd03cd56bf..0572ed77b40ef 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -1756,6 +1756,14 @@ cdef class ServerCallContext(_Weakrefable): """Check if the current RPC call has been canceled by the client.""" return self.context.is_cancelled() + def add_header(self, key, value): + """Add a response header.""" + self.context.AddHeader(tobytes(key), tobytes(value)) + + def add_trailer(self, key, value): + """Add a response trailer.""" + self.context.AddTrailer(tobytes(key), tobytes(value)) + def get_middleware(self, key): """ Get a middleware instance by key. @@ -3016,7 +3024,7 @@ cdef class FlightServerBase(_Weakrefable): def serve(self): """Block until the server shuts down. - This method only returns if shutdown() is called or a signal a + This method only returns if shutdown() is called or a signal is received. """ if self.server.get() == nullptr: @@ -3041,6 +3049,8 @@ cdef class FlightServerBase(_Weakrefable): method, as then the server will block forever waiting for that request to finish. Instead, call this method from a background thread. + + This method should only be called once. """ # Must not hold the GIL: shutdown waits for pending RPCs to # complete. 
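The new add_header and add_trailer hooks let Python Flight handlers attach gRPC-style response metadata. A hedged sketch of their use inside a server method (the EchoServer class and its data are illustrative, not from this patch):

    import pyarrow as pa
    import pyarrow.flight as flight

    class EchoServer(flight.FlightServerBase):
        def do_get(self, context, ticket):
            # Attach response metadata before returning the stream.
            context.add_header("x-request-id", "abc-123")
            context.add_trailer("x-rows-served", "3")
            table = pa.table({"a": [1, 2, 3]})
            return flight.RecordBatchStream(table)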
Holding the GIL means Python-implemented Flight diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index a3db7ab8d3001..39cdcc063b503 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -300,6 +300,10 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: c_bool encrypted_with_footer_key() const const c_string& key_metadata() const + cdef cppclass ParquetIndexLocation" parquet::IndexLocation": + int64_t offset + int32_t length + cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData": int64_t file_offset() const const c_string& file_path() const @@ -321,6 +325,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: int64_t total_compressed_size() const int64_t total_uncompressed_size() const unique_ptr[CColumnCryptoMetaData] crypto_metadata() const + optional[ParquetIndexLocation] GetColumnIndexLocation() const + optional[ParquetIndexLocation] GetOffsetIndexLocation() const cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData": c_bool Equals(const CRowGroupMetaData&) const @@ -420,6 +426,8 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* max_row_group_length(int64_t size) Builder* write_batch_size(int64_t batch_size) Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit) + Builder* enable_write_page_index() + Builder* disable_write_page_index() shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: @@ -567,7 +575,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( data_page_version=*, FileEncryptionProperties encryption_properties=*, write_batch_size=*, - dictionary_pagesize_limit=*) except * + dictionary_pagesize_limit=*, + write_page_index=*) except * cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 2f53d5fbbaa34..4448f359ac1aa 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -175,17 +175,18 @@ cdef class Statistics(_Weakrefable): @property def null_count(self): """Number of null values in chunk (int).""" - return self.statistics.get().null_count() + if self.has_null_count: + return self.statistics.get().null_count() + else: + return None @property def distinct_count(self): - """ - Distinct number of values in chunk (int). - - If this is not set, will return 0. - """ - # This seems to be zero if not set. 
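With has_null_count consulted above (and has_distinct_count in the matching change just below), absent statistics now surface as None instead of a misleading 0. A short sketch of the visible effect, given that Parquet writers typically do not record distinct counts:

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"x": [1, None, 2]})
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)

    metadata = pq.read_metadata(pa.BufferReader(sink.getvalue()))
    stats = metadata.row_group(0).column(0).statistics
    print(stats.null_count)      # 1
    print(stats.distinct_count)  # None (previously 0 even when never written)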
See: ARROW-11793 - return self.statistics.get().distinct_count() + """Distinct number of values in chunk (int).""" + if self.has_distinct_count: + return self.statistics.get().distinct_count() + else: + return None @property def num_values(self): @@ -462,7 +463,7 @@ cdef class ColumnChunkMetaData(_Weakrefable): @property def dictionary_page_offset(self): - """Offset of dictionary page reglative to column chunk offset (int).""" + """Offset of dictionary page relative to column chunk offset (int).""" if self.has_dictionary_page: return self.metadata.dictionary_page_offset() else: @@ -470,7 +471,7 @@ cdef class ColumnChunkMetaData(_Weakrefable): @property def data_page_offset(self): - """Offset of data page reglative to column chunk offset (int).""" + """Offset of data page relative to column chunk offset (int).""" return self.metadata.data_page_offset() @property @@ -493,6 +494,16 @@ cdef class ColumnChunkMetaData(_Weakrefable): """Uncompressed size in bytes (int).""" return self.metadata.total_uncompressed_size() + @property + def has_offset_index(self): + """Whether the column chunk has an offset index""" + return self.metadata.GetOffsetIndexLocation().has_value() + + @property + def has_column_index(self): + """Whether the column chunk has a column index""" + return self.metadata.GetColumnIndexLocation().has_value() + cdef class RowGroupMetaData(_Weakrefable): """Metadata for a single row group.""" @@ -1455,7 +1466,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( data_page_version=None, FileEncryptionProperties encryption_properties=None, write_batch_size=None, - dictionary_pagesize_limit=None) except *: + dictionary_pagesize_limit=None, + write_page_index=False) except *: """General writer properties""" cdef: shared_ptr[WriterProperties] properties @@ -1599,6 +1611,13 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( # a size larger than this then it will be latched to this value. props.max_row_group_length(_MAX_ROW_GROUP_SIZE) + # page index + + if write_page_index: + props.enable_write_page_index() + else: + props.disable_write_page_index() + properties = props.build() return properties @@ -1710,7 +1729,8 @@ cdef class ParquetWriter(_Weakrefable): encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, - store_schema=True): + store_schema=True, + write_page_index=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -1740,7 +1760,8 @@ cdef class ParquetWriter(_Weakrefable): data_page_version=data_page_version, encryption_properties=encryption_properties, write_batch_size=write_batch_size, - dictionary_pagesize_limit=dictionary_pagesize_limit + dictionary_pagesize_limit=dictionary_pagesize_limit, + write_page_index=write_page_index ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index e76c7b9ffa730..51c248d147828 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -140,14 +140,20 @@ cdef class S3FileSystem(FileSystem): """ S3-backed FileSystem implementation - If neither access_key nor secret_key are provided, and role_arn is also not - provided, then attempts to initialize from AWS environment variables, - otherwise both access_key and secret_key must be provided. + AWS access_key and secret_key can be provided explicitly. 
If role_arn is provided instead of access_key and secret_key, temporary credentials will be fetched by issuing a request to STS to assume the specified role. + If neither access_key nor secret_key are provided, and role_arn is also not + provided, then attempts to establish the credentials automatically. + S3FileSystem will try the following methods, in order: + + * ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, and ``AWS_SESSION_TOKEN`` environment variables + * configuration files such as ``~/.aws/credentials`` and ``~/.aws/config`` + * for nodes on Amazon EC2, the EC2 Instance Metadata Service + Note: S3 buckets are special and the operations available on them may be limited or more expensive than desired. diff --git a/python/pyarrow/includes/libarrow_flight.pxd b/python/pyarrow/includes/libarrow_flight.pxd index 34ba809438e2c..624904ed77a69 100644 --- a/python/pyarrow/includes/libarrow_flight.pxd +++ b/python/pyarrow/includes/libarrow_flight.pxd @@ -257,6 +257,8 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: c_string& peer_identity() c_string& peer() c_bool is_cancelled() + void AddHeader(const c_string& key, const c_string& value) + void AddTrailer(const c_string& key, const c_string& value) CServerMiddleware* GetMiddleware(const c_string& key) cdef cppclass CTimeoutDuration" arrow::flight::TimeoutDuration": diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index bf5fc6b24f649..e0cdfee62ef4b 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -874,6 +874,13 @@ def _sanitize_table(table, new_schema, flavor): it will restore the timezone (Parquet only stores the UTC values without timezone), or columns with duration type will be restored from the int64 Parquet column. +write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read side by PyArrow.
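A short end-to-end sketch of the new flag, mirroring the test added later in this patch: write with write_page_index=True, then confirm the index locations via the new metadata accessors.

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"a": [1, 2, 3]})
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink, write_page_index=True)

    metadata = pq.read_metadata(pa.BufferReader(sink.getvalue()))
    column = metadata.row_group(0).column(0)
    assert column.has_column_index and column.has_offset_index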
""" _parquet_writer_example_doc = """\ @@ -966,6 +973,7 @@ def __init__(self, where, schema, filesystem=None, write_batch_size=None, dictionary_pagesize_limit=None, store_schema=True, + write_page_index=False, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1022,6 +1030,7 @@ def __init__(self, where, schema, filesystem=None, write_batch_size=write_batch_size, dictionary_pagesize_limit=dictionary_pagesize_limit, store_schema=store_schema, + write_page_index=write_page_index, **options) self.is_open = True @@ -3084,6 +3093,7 @@ def write_table(table, where, row_group_size=None, version='2.6', write_batch_size=None, dictionary_pagesize_limit=None, store_schema=True, + write_page_index=False, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -3111,6 +3121,7 @@ def write_table(table, where, row_group_size=None, version='2.6', write_batch_size=write_batch_size, dictionary_pagesize_limit=dictionary_pagesize_limit, store_schema=store_schema, + write_page_index=write_page_index, **kwargs) as writer: writer.write_table(table, row_group_size=row_group_size) except Exception: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index f438c8847bb02..aff1c311abbfb 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -522,6 +522,23 @@ cdef class TimestampScalar(Scalar): return _datetime_from_int(sp.value, unit=dtype.unit(), tzinfo=tzinfo) + def __repr__(self): + """ + Return the representation of TimestampScalar using `strftime` to avoid + original repr datetime values being out of range. + """ + cdef: + CTimestampScalar* sp = self.wrapped.get() + CTimestampType* dtype = sp.type.get() + + if not dtype.timezone().empty(): + type_format = str(_pc().strftime(self, format="%Y-%m-%dT%H:%M:%S%z")) + else: + type_format = str(_pc().strftime(self)) + return ''.format( + self.__class__.__name__, type_format + ) + cdef class DurationScalar(Scalar): """ @@ -785,7 +802,7 @@ cdef class MapScalar(ListScalar): if arr is None: raise IndexError(i) dct = arr[_normalize_index(i, len(arr))] - return (dct['key'], dct['value']) + return (dct[self.type.key_field.name], dct[self.type.item_field.name]) def __iter__(self): """ @@ -794,7 +811,7 @@ cdef class MapScalar(ListScalar): arr = self.values if array is None: raise StopIteration - for k, v in zip(arr.field('key'), arr.field('value')): + for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) def as_py(self): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 238fdb86bcc56..f08162089b8fa 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -460,10 +460,16 @@ cdef class ChunkedArray(_PandasConvertible): def _to_pandas(self, options, types_mapper=None, **kwargs): return _array_like_to_pandas(self, options, types_mapper=types_mapper) - def to_numpy(self): + def to_numpy(self, zero_copy_only=False): """ Return a NumPy copy of this array (experimental). + Parameters + ---------- + zero_copy_only : bool, default False + Introduced for signature consistence with pyarrow.Array.to_numpy. + This must be False here since NumPy arrays' buffer must be contiguous. 
+ Returns ------- array : numpy.ndarray >>> n_legs.to_numpy() array([ 2, 2, 4, 4, 5, 100]) """ + if zero_copy_only: + raise ValueError( + "zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy" + ) cdef: PyObject* out PandasOptions c_options diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 090c35973e816..fd5eff3ecfc58 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -161,14 +161,14 @@ def test_parquet_metadata_lifetime(tempdir): 'distinct_count' ), [ - ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None), - ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None), - ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None), - ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None), #( # [-1.1, 2.2, 2.3, None, 4.4], pa.float16(), # 'HALFFLOAT', -1.1, 4.4, 1, 4, 0 @@ -176,23 +176,23 @@ def test_parquet_metadata_lifetime(tempdir): # float16 operations are not yet implemented ( [-1.1, 2.2, 2.3, None, 4.4], pa.float32(), - 'FLOAT', -1.1, 4.4, 1, 4, 0 + 'FLOAT', -1.1, 4.4, 1, 4, None ), ( [-1.1, 2.2, 2.3, None, 4.4], pa.float64(), - 'DOUBLE', -1.1, 4.4, 1, 4, 0 + 'DOUBLE', -1.1, 4.4, 1, 4, None ), ( ['', 'b', chr(1000), None, 'aaa'], pa.binary(), - 'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, 0 + 'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, None ), ( [True, False, False, True, True], pa.bool_(), - 'BOOLEAN', False, True, 0, 5, 0 + 'BOOLEAN', False, True, 0, 5, None ), ( [b'\x00', b'b', b'12', None, b'aaa'], pa.binary(), - 'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0 + 'BYTE_ARRAY', b'\x00', b'b', 1, 4, None ), ] ) @@ -362,6 +362,21 @@ def test_field_id_metadata(): assert schema[5].metadata[field_id] == b'-1000' +def test_parquet_file_page_index(): + for write_page_index in (False, True): + table = pa.table({'a': [1, 2, 3]}) + + writer = pa.BufferOutputStream() + _write_table(table, writer, write_page_index=write_page_index) + reader = pa.BufferReader(writer.getvalue()) + + # Can retrieve page index locations from metadata + metadata = pq.read_metadata(reader) + cc = metadata.row_group(0).column(0) + assert cc.has_offset_index is write_page_index + assert cc.has_column_index is write_page_index + + @pytest.mark.pandas def test_multi_dataset_metadata(tempdir): filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"] diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index c728a842c1137..417daa80449c5 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -179,6 +179,21 @@ def test_to_numpy_zero_copy(): np.testing.assert_array_equal(np_arr, expected) + +def
test_chunked_array_to_numpy_zero_copy(): + elements = [[2, 2, 4], [4, 5, 100]] + + chunked_arr = pa.chunked_array(elements) + + msg = "zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy" + + with pytest.raises(ValueError, match=msg): + chunked_arr.to_numpy(zero_copy_only=True) + + np_arr = chunked_arr.to_numpy() + expected = [2, 2, 4, 4, 5, 100] + np.testing.assert_array_equal(np_arr, expected) + + def test_to_numpy_unsupported_types(): # ARROW-2871: Some primitive types are not yet supported in to_numpy bool_arr = pa.array([True, False, True]) @@ -286,7 +301,7 @@ def test_asarray(): np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert type(np_arr[0]) == pa.lib.Int64Value + assert isinstance(np_arr[0], pa.lib.Int64Value) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 865fecc7b2291..98ab84c03900f 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -537,7 +537,7 @@ def test_trim(): def test_slice_compatibility(): arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"]) for start in range(-6, 6): - for stop in range(-6, 6): + for stop in itertools.chain(range(-6, 6), [None]): for step in [-3, -2, -1, 1, 2, 3]: expected = pa.array([k.as_py()[start:stop:step] for k in arr]) @@ -1756,6 +1756,17 @@ def test_logical(): assert pc.invert(a) == pa.array([False, True, True, None]) +def test_dictionary_decode(): + array = pa.array(["a", "a", "b", "c", "b"]) + dictionary_array = array.dictionary_encode() + dictionary_array_decode = pc.dictionary_decode(dictionary_array) + + assert array != dictionary_array + + assert array == dictionary_array_decode + assert array == pc.dictionary_decode(array) + + def test_cast(): arr = pa.array([1, 2, 3, 4], type='int64') options = pc.CastOptions(pa.int8()) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index af4c91a89459d..cf2535a3c62d1 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1353,7 +1353,7 @@ def test_sequence_timestamp_from_int_with_unit(): assert len(arr_s) == 1 assert arr_s.type == s assert repr(arr_s[0]) == ( - "<pyarrow.TimestampScalar: datetime.datetime(1970, 1, 1, 0, 0, 1)>" + "<pyarrow.TimestampScalar: '1970-01-01T00:00:01'>" ) assert str(arr_s[0]) == "1970-01-01 00:00:01" diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index e92afce035275..81c31d98ac7ec 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1936,7 +1936,7 @@ def test_write_quoting_style(): except Exception as e: # This will trigger when we try to write a comma (,) # without quotes, which is invalid - assert type(e) == res + assert isinstance(e, res) break assert buf.getvalue() == res buf.seek(0) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 814454861e790..f92317c0f223e 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -589,6 +589,7 @@ def test_partitioning(): partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) assert partitioning == klass(schema) + assert partitioning != "other object" schema = pa.schema([ pa.field('group', pa.int64()), @@ -1642,12 +1643,15 @@ def test_fragments_repr(tempdir, dataset): @pytest.mark.parquet -def test_partitioning_factory(mockfs): +@pytest.mark.parametrize( "pickled", [lambda x: x, lambda x:
pickle.loads(pickle.dumps(x))]) +def test_partitioning_factory(mockfs, pickled): paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) + partitioning_factory = pickled(partitioning_factory) assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory @@ -1673,13 +1677,16 @@ def test_partitioning_factory(mockfs): @pytest.mark.parquet @pytest.mark.parametrize('infer_dictionary', [False, True]) -def test_partitioning_factory_dictionary(mockfs, infer_dictionary): +@pytest.mark.parametrize( + "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))]) +def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled): paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') - options.partitioning_factory = ds.DirectoryPartitioning.discover( + partitioning_factory = ds.DirectoryPartitioning.discover( ['group', 'key'], infer_dictionary=infer_dictionary) + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory( mockfs, paths_or_selector, format, options) @@ -1703,7 +1710,9 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): assert inferred_schema.field('key').type == pa.string() -def test_partitioning_factory_segment_encoding(): +@pytest.mark.parametrize( + "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))]) +def test_partitioning_factory_segment_encoding(pickled): mockfs = fs._MockFileSystem() format = ds.IpcFileFormat() schema = pa.schema([("i64", pa.int64())]) @@ -1726,8 +1735,9 @@ def test_partitioning_factory_segment_encoding(): # Directory selector = fs.FileSelector("directory", recursive=True) options = ds.FileSystemFactoryOptions("directory") - options.partitioning_factory = ds.DirectoryPartitioning.discover( + partitioning_factory = ds.DirectoryPartitioning.discover( schema=partition_schema) + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) inferred_schema = factory.inspect() assert inferred_schema == full_schema @@ -1736,24 +1746,27 @@ def test_partitioning_factory_segment_encoding(): }) assert actual[0][0].as_py() == 1620086400 - options.partitioning_factory = ds.DirectoryPartitioning.discover( + partitioning_factory = ds.DirectoryPartitioning.discover( ["date", "string"], segment_encoding="none") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( (ds.field("date") == "2021-05-04 00%3A00%3A00") & (ds.field("string") == "%24")) - options.partitioning = ds.DirectoryPartitioning( + partitioning = ds.DirectoryPartitioning( string_partition_schema, segment_encoding="none") + options.partitioning = pickled(partitioning) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( (ds.field("date") == "2021-05-04 00%3A00%3A00") & (ds.field("string") == "%24")) - options.partitioning_factory = ds.DirectoryPartitioning.discover( + partitioning_factory = ds.DirectoryPartitioning.discover( schema=partition_schema, 
segment_encoding="none") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) with pytest.raises(pa.ArrowInvalid, match="Could not cast segments for partition field"): @@ -1762,8 +1775,9 @@ def test_partitioning_factory_segment_encoding(): # Hive selector = fs.FileSelector("hive", recursive=True) options = ds.FileSystemFactoryOptions("hive") - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( schema=partition_schema) + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) inferred_schema = factory.inspect() assert inferred_schema == full_schema @@ -1772,8 +1786,9 @@ def test_partitioning_factory_segment_encoding(): }) assert actual[0][0].as_py() == 1620086400 - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( segment_encoding="none") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( @@ -1788,15 +1803,18 @@ def test_partitioning_factory_segment_encoding(): (ds.field("date") == "2021-05-04 00%3A00%3A00") & (ds.field("string") == "%24")) - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( schema=partition_schema, segment_encoding="none") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) with pytest.raises(pa.ArrowInvalid, match="Could not cast segments for partition field"): inferred_schema = factory.inspect() -def test_partitioning_factory_hive_segment_encoding_key_encoded(): +@pytest.mark.parametrize( + "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))]) +def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled): mockfs = fs._MockFileSystem() format = ds.IpcFileFormat() schema = pa.schema([("i64", pa.int64())]) @@ -1825,8 +1843,9 @@ def test_partitioning_factory_hive_segment_encoding_key_encoded(): # Hive selector = fs.FileSelector("hive", recursive=True) options = ds.FileSystemFactoryOptions("hive") - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( schema=partition_schema) + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) inferred_schema = factory.inspect() assert inferred_schema == full_schema @@ -1835,40 +1854,45 @@ def test_partitioning_factory_hive_segment_encoding_key_encoded(): }) assert actual[0][0].as_py() == 1620086400 - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( segment_encoding="uri") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( (ds.field("test'; date") == "2021-05-04 00:00:00") & (ds.field("test';[ string'") == "$")) - options.partitioning = ds.HivePartitioning( + partitioning = ds.HivePartitioning( string_partition_schema, segment_encoding="uri") + options.partitioning = pickled(partitioning) factory = 
ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( (ds.field("test'; date") == "2021-05-04 00:00:00") & (ds.field("test';[ string'") == "$")) - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( segment_encoding="none") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( (ds.field("test%27%3B%20date") == "2021-05-04 00%3A00%3A00") & (ds.field("test%27%3B%5B%20string%27") == "%24")) - options.partitioning = ds.HivePartitioning( + partitioning = ds.HivePartitioning( string_partition_schema_en, segment_encoding="none") + options.partitioning = pickled(partitioning) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) fragments = list(factory.finish().get_fragments()) assert fragments[0].partition_expression.equals( (ds.field("test%27%3B%20date") == "2021-05-04 00%3A00%3A00") & (ds.field("test%27%3B%5B%20string%27") == "%24")) - options.partitioning_factory = ds.HivePartitioning.discover( + partitioning_factory = ds.HivePartitioning.discover( schema=partition_schema_en, segment_encoding="none") + options.partitioning_factory = pickled(partitioning_factory) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) with pytest.raises(pa.ArrowInvalid, match="Could not cast segments for partition field"): @@ -4997,8 +5021,8 @@ def test_dataset_filter(tempdir, dstype): # Ensure chained filtering works. result = ds1.filter(pc.field("colA") < 3).filter(pc.field("col2") == "a") - assert type(result) == (ds.FileSystemDataset if dstype == - "fs" else ds.InMemoryDataset) + expected = ds.FileSystemDataset if dstype == "fs" else ds.InMemoryDataset + assert isinstance(result, expected) assert result.to_table() == pa.table({ "colA": [1], @@ -5157,9 +5181,9 @@ def test_read_table_nested_columns(tempdir, format): "a.dotted.field": [1, 2], "interaction": [ {"type": None, "element": "button", - "values": [1, 2], "structs":[{"foo": "bar"}, None]}, + "values": [1, 2], "structs": [{"foo": "bar"}, None]}, {"type": "scroll", "element": "window", - "values": [None, 3, 4], "structs":[{"fizz": "buzz"}]} + "values": [None, 3, 4], "structs": [{"fizz": "buzz"}]} ]}) ds.write_dataset(table, tempdir / "table", format=format) ds1 = ds.dataset(tempdir / "table", format=format) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 009896fd67e40..973aa29c7583f 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -340,8 +340,8 @@ def test_ext_scalar_from_array(): assert len(scalars_a) == 4 assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert type(a[0]) == UuidScalarType - assert type(scalars_a[0]) == UuidScalarType + assert isinstance(a[0], UuidScalarType) + assert isinstance(scalars_a[0], UuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -737,7 +737,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): def __eq__(self, other): if isinstance(other, pa.BaseExtensionType): - return (type(self) == type(other) and + return (isinstance(self, type(other)) and self.freq == other.freq) else: return NotImplemented @@ -799,7 +799,7 @@ def 
test_generic_ext_type_ipc(registered_period_type): arr = pa.ExtensionArray.from_storage(period_type, storage) batch = pa.RecordBatch.from_arrays([arr], ["ext"]) # check the built array has exactly the expected clss - assert type(arr) == period_class + assert isinstance(arr, period_class) buf = ipc_write_batch(batch) del batch @@ -807,7 +807,7 @@ def test_generic_ext_type_ipc(registered_period_type): result = batch.column(0) # check the deserialized array class is the expected one - assert type(result) == period_class + assert isinstance(result, period_class) assert result.type.extension_name == "test.period" assert arr.storage.to_pylist() == [1, 2, 3, 4] @@ -830,7 +830,7 @@ def test_generic_ext_type_ipc(registered_period_type): result = batch.column(0) assert isinstance(result.type, PeriodType) assert result.type.freq == 'H' - assert type(result) == period_class + assert isinstance(result, period_class) def test_generic_ext_type_ipc_unknown(registered_period_type): @@ -1261,7 +1261,7 @@ def test_tensor_type_ipc(tensor_type): # check the built array has exactly the expected clss tensor_class = tensor_type.__arrow_ext_class__() - assert type(arr) == tensor_class + assert isinstance(arr, tensor_class) buf = ipc_write_batch(batch) del batch @@ -1269,7 +1269,7 @@ def test_tensor_type_ipc(tensor_type): result = batch.column(0) # check the deserialized array class is the expected one - assert type(result) == tensor_class + assert isinstance(result, tensor_class) assert result.type.extension_name == "arrow.fixed_shape_tensor" assert arr.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]] diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 930523b9f5442..bf15ad0bc4d65 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -833,7 +833,7 @@ def sending_headers(self): def received_headers(self, headers): # Let the test code know what the last set of headers we # received were. 
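        # Note: received_headers can be invoked more than once per call
        # (e.g. once for initial metadata and once for trailing metadata),
        # which is why the factory merges each batch into last_headers
        # below instead of overwriting it.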
- self.factory.last_headers = headers + self.factory.last_headers.update(headers) class MultiHeaderServerMiddlewareFactory(ServerMiddlewareFactory): @@ -1495,7 +1495,7 @@ def test_tls_override_hostname(): """Check that incorrectly overriding the hostname fails.""" certs = example_tls_certs() - with ConstantFlightServer(tls_certificates=certs["certificates"]) as s,\ + with ConstantFlightServer(tls_certificates=certs["certificates"]) as s, \ flight.connect(('localhost', s.port), tls_root_certs=certs["root_cert"], override_hostname="fakehostname") as client: @@ -2323,3 +2323,45 @@ def test_do_put_does_not_crash_when_schema_is_none(): with pytest.raises(TypeError, match=msg): client.do_put(flight.FlightDescriptor.for_command('foo'), schema=None) + + +def test_headers_trailers(): + """Ensure that server-sent headers/trailers make it through.""" + + class HeadersTrailersFlightServer(FlightServerBase): + def get_flight_info(self, context, descriptor): + context.add_header("x-header", "header-value") + context.add_header("x-header-bin", "header\x01value") + context.add_trailer("x-trailer", "trailer-value") + context.add_trailer("x-trailer-bin", "trailer\x01value") + return flight.FlightInfo( + pa.schema([]), + descriptor, + [], + -1, -1 + ) + + class HeadersTrailersMiddlewareFactory(ClientMiddlewareFactory): + def __init__(self): + self.headers = [] + + def start_call(self, info): + return HeadersTrailersMiddleware(self) + + class HeadersTrailersMiddleware(ClientMiddleware): + def __init__(self, factory): + self.factory = factory + + def received_headers(self, headers): + for key, values in headers.items(): + for value in values: + self.factory.headers.append((key, value)) + + factory = HeadersTrailersMiddlewareFactory() + with HeadersTrailersFlightServer() as server, \ + FlightClient(("localhost", server.port), middleware=[factory]) as client: + client.get_flight_info(flight.FlightDescriptor.for_path("")) + assert ("x-header", "header-value") in factory.headers + assert ("x-header-bin", b"header\x01value") in factory.headers + assert ("x-trailer", "trailer-value") in factory.headers + assert ("x-trailer-bin", b"trailer\x01value") in factory.headers diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py index 092c50de33b90..d9fdeb152c35e 100644 --- a/python/pyarrow/tests/test_memory.py +++ b/python/pyarrow/tests/test_memory.py @@ -134,8 +134,14 @@ def check_env_var(name, expected, *, expect_warning=False): res.check_returncode() # fail errlines = res.stderr.splitlines() if expect_warning: - assert len(errlines) == 1 - assert f"Unsupported backend '{name}'" in errlines[0] + assert len(errlines) in (1, 2) + if len(errlines) == 1: + # ARROW_USE_GLOG=OFF + assert f"Unsupported backend '{name}'" in errlines[0] + else: + # ARROW_USE_GLOG=ON + assert "InitGoogleLogging()" in errlines[0] + assert f"Unsupported backend '{name}'" in errlines[1] else: assert len(errlines) == 0 diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index cf71b8b82d642..8bdc7253a4837 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4258,20 +4258,19 @@ def test_to_pandas_extension_dtypes_mapping(): assert isinstance(result['a'].dtype, pd.PeriodDtype) -@pytest.mark.parametrize("arr", - [pd.period_range("2012-01-01", periods=3, freq="D").array, - pd.interval_range(1, 4).array]) -def test_array_to_pandas(arr): +def test_array_to_pandas(): if Version(pd.__version__) < Version("1.1"): pytest.skip("ExtensionDtype to_pandas 
method missing") - result = pa.array(arr).to_pandas() - expected = pd.Series(arr) - tm.assert_series_equal(result, expected) + for arr in [pd.period_range("2012-01-01", periods=3, freq="D").array, + pd.interval_range(1, 4).array]: + result = pa.array(arr).to_pandas() + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) - result = pa.table({"col": arr})["col"].to_pandas() - expected = pd.Series(arr, name="col") - tm.assert_series_equal(result, expected) + result = pa.table({"col": arr})["col"].to_pandas() + expected = pd.Series(arr, name="col") + tm.assert_series_equal(result, expected) def test_roundtrip_empty_table_with_extension_dtype_index(): diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index b7180e5250fdf..a989301fe5735 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -154,6 +154,18 @@ def test_hashing_struct_scalar(): assert hash1 == hash2 +def test_timestamp_scalar(): + a = repr(pa.scalar("0000-01-01").cast(pa.timestamp("s"))) + assert a == "" + b = repr(pa.scalar(datetime.datetime(2015, 1, 1), type=pa.timestamp('s', tz='UTC'))) + assert b == "" + c = repr(pa.scalar(datetime.datetime(2015, 1, 1), type=pa.timestamp('us'))) + assert c == "" + d = repr(pc.assume_timezone( + pa.scalar("2000-01-01").cast(pa.timestamp("s")), "America/New_York")) + assert d == "" + + def test_bool(): false = pa.scalar(False) true = pa.scalar(True) @@ -791,3 +803,26 @@ def test_union(): assert arr[0].as_py() == b'a' assert arr[5].type_code == 1 assert arr[5].as_py() == 3 + + +def test_map_scalar_as_py_with_custom_field_name(): + """ + Check we can call `MapScalar.as_py` with custom field names + + See https://github.com/apache/arrow/issues/36809 + """ + assert pa.scalar( + [("foo", "bar")], + pa.map_( + pa.string(), + pa.string() + ), + ).as_py() == [("foo", "bar")] + + assert pa.scalar( + [("foo", "bar")], + pa.map_( + pa.field("custom_key", pa.string(), nullable=False), + pa.field("custom_value", pa.string()), + ), + ).as_py() == [("foo", "bar")] diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 2f2417f590a2d..2c2f9547f2276 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -50,6 +50,7 @@ def test_type_integers(): assert str(t) == name +@pytest.mark.pandas def test_type_to_pandas_dtype(): M8 = np.dtype('datetime64[ms]') if Version(pd.__version__) < Version("2.0.0"): diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index e66c7a79e877d..e28256e91f0a6 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -511,7 +511,7 @@ def test_recordbatch_basics(): ('c0', [0, 1, 2, 3, 4]), ('c1', [-10, -5, 0, None, 10]) ]) - assert type(pydict) == dict + assert isinstance(pydict, dict) with pytest.raises(IndexError): # bounds checking @@ -949,7 +949,7 @@ def test_table_basics(): ('a', [0, 1, 2, 3, 4]), ('b', [-10, -5, 0, 5, 10]) ]) - assert type(pydict) == dict + assert isinstance(pydict, dict) columns = [] for col in table.itercolumns(): @@ -2422,3 +2422,28 @@ def test_numpy_asarray(constructor): result = np.asarray(table3, dtype="int32") np.testing.assert_allclose(result, expected) assert result.dtype == "int32" + + +@pytest.mark.acero +def test_invalid_non_join_column(): + NUM_ITEMS = 30 + t1 = pa.Table.from_pydict({ + 'id': range(NUM_ITEMS), + 'array_column': [[z for z in range(3)] for x in range(NUM_ITEMS)], + }) + t2 = pa.Table.from_pydict({ + 'id': 
range(NUM_ITEMS), + 'value': [x for x in range(NUM_ITEMS)] + }) + + # check as left table + with pytest.raises(pa.lib.ArrowInvalid) as excinfo: + t1.join(t2, 'id', join_type='inner') + exp_error_msg = "Data type list is not supported " \ + + "in join non-key field array_column" + assert exp_error_msg in str(excinfo.value) + + # check as right table + with pytest.raises(pa.lib.ArrowInvalid) as excinfo: + t2.join(t1, 'id', join_type='inner') + assert exp_error_msg in str(excinfo.value) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index fbd4f8a94b64c..12ad2fc4b6f60 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -3605,9 +3605,9 @@ def timestamp(unit, tz=None): >>> from datetime import datetime >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('s', tz='UTC')) - )> + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('us')) - + Returns ------- diff --git a/python/pyproject.toml b/python/pyproject.toml index fe8c938a9ce4f..7e61304585809 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,7 +17,7 @@ [build-system] requires = [ - "cython >= 0.29.31", + "cython >= 0.29.31,<3", "oldest-supported-numpy>=0.14", "setuptools_scm", "setuptools >= 40.1.0", diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 507e9081373e2..6378d1b94e1bb 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31 +cython>=0.29.31,<3 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=38.6.0 diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index 6043d2ffb2c6e..e4f5243fbc2fe 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31 +cython>=0.29.31,<3 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=58 diff --git a/python/setup.py b/python/setup.py index f06cb5a627562..dc529679c7f90 100755 --- a/python/setup.py +++ b/python/setup.py @@ -40,8 +40,9 @@ # Check if we're running 64-bit Python is_64_bit = sys.maxsize > 2**32 -if Cython.__version__ < '0.29.31': - raise Exception('Please upgrade to Cython 0.29.31 or newer') +if Cython.__version__ < '0.29.31' or Cython.__version__ >= '3.0': + raise Exception( + 'Please update your Cython version. Supported Cython >= 0.29.31, < 3.0') setup_dir = os.path.abspath(os.path.dirname(__file__)) @@ -491,7 +492,7 @@ def has_ext_modules(foo): 'pyarrow/_generated_version.py'), 'version_scheme': guess_next_dev_version }, - setup_requires=['setuptools_scm', 'cython >= 0.29.31'] + setup_requires, + setup_requires=['setuptools_scm', 'cython >= 0.29.31,<3'] + setup_requires, install_requires=install_requires, tests_require=['pytest', 'pandas', 'hypothesis'], python_requires='>=3.8', diff --git a/r/.lintr b/r/.lintr index 1bd80aff4c62d..085ff45123411 100644 --- a/r/.lintr +++ b/r/.lintr @@ -15,6 +15,7 @@ license: # Licensed to the Apache Software Foundation (ASF) under one # specific language governing permissions and limitations # under the License. 
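For context on the join test added in test_table.py above: Acero's hash join rejects nested types such as lists in the non-key columns of either input. A minimal standalone sketch of that behavior, with illustrative table contents that are not taken from the diff:

    import pyarrow as pa

    left = pa.table({"id": [1, 2], "tags": [["a"], ["b", "c"]]})  # list-typed column
    right = pa.table({"id": [1, 2], "value": [10, 20]})

    try:
        left.join(right, "id", join_type="inner")
    except pa.lib.ArrowInvalid as exc:
        # the message names the unsupported list-typed non-key field
        print(exc)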
linters: linters_with_defaults( + indentation_linter = NULL, line_length_linter = line_length_linter(120), object_name_linter = NULL, # Even with a liberal definition of name styles, some of our names cause issues due to `.`s for s3 classes or NA in the name diff --git a/r/NAMESPACE b/r/NAMESPACE index aa7b30252bbc0..7eaa51bc5771f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -443,6 +443,7 @@ importFrom(rlang,as_label) importFrom(rlang,as_quosure) importFrom(rlang,call2) importFrom(rlang,call_args) +importFrom(rlang,call_name) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) importFrom(rlang,check_dots_empty0) diff --git a/r/NEWS.md b/r/NEWS.md index f358c2aae45fc..6d09355170a67 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,40 @@ # arrow 12.0.1.9000 +## New features + +* `open_dataset()` now works with ND-JSON files (#35055) +* Calling `schema()` on multiple Arrow objects now returns the object's schema (#35543) +* dplyr `.by`/`by` argument now supported in arrow implementation of dplyr verbs (@eitsupi, #35667) + +## Minor improvements and fixes + +* Convenience function `arrow_array()` can be used to create Arrow Arrays (#36381) +* Convenience function `scalar()` can be used to create Arrow Scalars (#36265) +* Prevent crashed when passing data between arrow and duckdb by always calling `RecordBatchReader::ReadNext()` from DuckDB from the main R thread (#36307) +* Issue a warning for `set_io_thread_count()` with `num_threads` < 2 (#36304) +* Ensure missing grouping variables are added to the beginning of the variable list (#36305) +* CSV File reader options class objects can print the selected values (#35955) +* Schema metadata can be set as a named character vector (#35954) +* Ensure that the RStringViewer helper class does not own any Array references (#35812) +* `strptime()` in arrow will return a timezone-aware timestamp if `%z` is part of the format string (#35671) +* Column ordering when combining `group_by()` and `across()` now matches dplyr (@eitsupi, #35473) + +## Installation + +* Link to correct version of OpenSSL when using autobrew (#36551) +* Require cmake 3.16 in bundled build script (#36321) + +## Docs + +* Split out R6 classes and convenience functions to improve readability (#36394) +* Enable pkgdown built-in search (@eitsupi, #36374) +* Re-organise reference page on pkgdown site to improve readability (#36171) + +# arrow 12.0.1.1 + +* Update a package version reference to be text only instead of numeric due to CRAN update requiring this (#36353, #36364) + # arrow 12.0.1 * Update the version of the date library vendored with Arrow C++ library diff --git a/r/PACKAGING.md b/r/PACKAGING.md index 6cfa903650729..edfca651e9d38 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -31,7 +31,7 @@ For a high-level overview of the release process see the - [ ] Ensure the contents of the README are accurate and up to date. - [ ] Run `urlchecker::url_check()` on the R directory at the release candidate. commit. Ignore any errors with badges as they will be removed in the CRAN release branch. -- [ ] [Polish NEWS](https://style.tidyverse.org/news.html#news-release) but do **not** update version numbers (this is done automatically later). +- [ ] [Polish NEWS](https://style.tidyverse.org/news.html#news-release) but do **not** update version numbers (this is done automatically later). You can find commits by, for example, `git log --oneline aa057d0..HEAD | grep "\[R\]"` - [ ] Run preliminary reverse dependency checks using `archery docker run r-revdepcheck`. 
- [ ] For major releases, prepare tweet thread highlighting new features. @@ -126,7 +126,7 @@ Wait for CRAN... - [ ] Tag the tip of the CRAN-specific release branch - [ ] Add a new line to the matrix in the [backwards compatability job](https://github.com/apache/arrow/blob/main/dev/tasks/r/github.linux.arrow.version.back.compat.yml) - [ ] (patch releases only) Update the package version in `ci/scripts/PKGBUILD`, `dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb`, `r/DESCRIPTION`, and `r/NEWS.md` -- [ ] (CRAN-only releases) Rebuild the docs with `pkgdown::build_site(examples = FALSE, lazy = TRUE, install = FALSE)` and submit a PR to [the `asf-site` branch of the docs site](https://github.com/apache/arrow-site) with the contents of `r/docs/news/index.html`. +- [ ] (CRAN-only releases) Rebuild news page with `pkgdown::build_news()` and submit a PR to the asf-site branch of the docs site with the contents of `arrow/r/docs/news/index.html` replacing the current contents of `arrow-site/docs/r/news/index.html` - [ ] (CRAN-only releases) Bump the version number in `r/pkgdown/assets/versions.json`, and update this on the [the `asf-site` branch of the docs site](https://github.com/apache/arrow-site) too. - [ ] Update the packaging checklist template to reflect any new realities of the packaging process. diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 79871d8735c96..8f44f8936bdd3 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -27,7 +27,7 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure #' @importFrom rlang new_quosures expr_text caller_env check_dots_empty check_dots_empty0 dots_list is_string inform -#' @importFrom rlang is_bare_list +#' @importFrom rlang is_bare_list call_name #' @importFrom tidyselect vars_pull eval_select eval_rename #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE diff --git a/r/R/buffer.R b/r/R/buffer.R index b7f6895a618c0..04d9e8c938ff3 100644 --- a/r/R/buffer.R +++ b/r/R/buffer.R @@ -30,8 +30,8 @@ #' - `$size` : size in memory, in bytes #' - `$capacity`: possible capacity, in bytes #' -#' @rdname buffer -#' @name buffer +#' @rdname Buffer-class +#' @name Buffer #' @examples #' my_buffer <- buffer(c(1, 2, 3, 4)) #' my_buffer$is_mutable @@ -69,8 +69,11 @@ Buffer$create <- function(x) { } } +#' Create a Buffer +#' @rdname buffer #' @param x R object. Only raw, numeric and integer vectors are currently supported #' @return an instance of `Buffer` that borrows memory from `x` +#' @seealso [Buffer] #' @export buffer <- Buffer$create diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index dd1beb2afd14d..90b5f4115df94 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -55,7 +55,7 @@ #' - `$Validate()`: Perform any validation checks to determine obvious inconsistencies #' within the array's internal data. This can be an expensive check, potentially `O(length)` #' -#' @rdname ChunkedArray +#' @rdname ChunkedArray-class #' @name ChunkedArray #' @seealso [Array] #' @examples @@ -154,9 +154,26 @@ c.ChunkedArray <- function(...) { ChunkedArray$create(...) } -#' @param \dots Vectors to coerce -#' @param type currently ignored -#' @rdname ChunkedArray +#' Create a Chunked Array +#' +#' @param ... R objects to coerce into a ChunkedArray. They must be of the same type. +#' @param type An optional [data type][data-type]. If omitted, the type will be inferred from the data. 
+#' @rdname chunked_array +#' @examples +#' # Pass items into chunked_array as separate objects to create chunks +#' class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73)) +#' +#' # If you pass a list into chunked_array, you get a list of length 1 +#' list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8))) +#' +#' # When constructing a ChunkedArray, the first chunk is used to infer type. +#' infer_type(chunked_array(c(1, 2, 3), c(5L, 6L, 7L))) +#' +#' # Concatenating chunked arrays returns a new chunked array containing all chunks +#' a <- chunked_array(c(1, 2), 3) +#' b <- chunked_array(c(4, 5), 6) +#' c(a, b) +#' @seealso [ChunkedArray] #' @export chunked_array <- ChunkedArray$create diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 436083d9de455..b4becb4081bcb 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -56,15 +56,33 @@ get_stringr_pattern_options <- function(pattern) { ) } } + ensure_opts <- function(opts) { if (is.character(opts)) { opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE) } opts } + + pattern <- clean_pattern_namespace(pattern) + ensure_opts(eval(pattern)) } +# Ensure that e.g. stringr::regex and regex both work within patterns +clean_pattern_namespace <- function(pattern) { + modifier_funcs <- c("fixed", "regex", "coll", "boundary") + if (is_call(pattern, modifier_funcs, ns = "stringr")) { + function_called <- call_name(pattern[1]) + + if (function_called %in% modifier_funcs) { + pattern[1] <- call2(function_called) + } + } + + pattern +} + #' Does this string contain regex metacharacters? #' #' @param string String to be tested diff --git a/r/R/field.R b/r/R/field.R index fce193ab53a41..704c7b4ce85c0 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -28,8 +28,8 @@ #' - `f$ToString()`: convert to a string #' - `f$Equals(other)`: test for equality. More naturally called as `f == other` #' -#' @rdname Field #' @name Field +#' @rdname Field-class #' @export Field <- R6Class("Field", inherit = ArrowObject, @@ -63,6 +63,8 @@ Field$create <- function(name, type, metadata, nullable = TRUE) { #' @include arrowExports.R Field$import_from_c <- ImportField +#' Create a Field +#' #' @param name field name #' @param type logical type, instance of [DataType] #' @param metadata currently ignored @@ -71,6 +73,7 @@ Field$import_from_c <- ImportField #' @examples #' field("x", int32()) #' @rdname Field +#' @seealso [Field] #' @export field <- Field$create diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 528ecef2f3d13..b137b374e9773 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -75,7 +75,7 @@ #' Modify or replace by assigning in (`batch$metadata <- new_metadata`). #' All list elements are coerced to string. See `schema()` for more information. #' - `$columns`: Returns a list of `Array`s -#' @rdname RecordBatch +#' @rdname RecordBatch-class #' @name RecordBatch #' @export RecordBatch <- R6Class("RecordBatch", @@ -169,13 +169,15 @@ RecordBatch$from_message <- function(obj, schema) { #' @include arrowExports.R RecordBatch$import_from_c <- ImportRecordBatch +#' Create a RecordBatch +#' #' @param ... A `data.frame` or a named set of Arrays or vectors. If given a #' mixture of data.frames and vectors, the inputs will be autospliced together #' (see examples). Alternatively, you can provide a single Arrow IPC #' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`. 
#' @param schema a [Schema], or `NULL` (the default) to infer the schema from #' the data in `...`. When providing an Arrow IPC buffer, `schema` is required. -#' @rdname RecordBatch +#' @rdname record_batch #' @examples #' batch <- record_batch(name = rownames(mtcars), mtcars) #' dim(batch) diff --git a/r/R/schema.R b/r/R/schema.R index 70e53f6b6c853..1ad18e314191e 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -75,8 +75,7 @@ #' Files with compressed metadata are readable by older versions of arrow, but #' the metadata is dropped. #' -#' @rdname schema-class -#' @name Schema +#' @rdname Schema-class #' @export Schema <- R6Class("Schema", inherit = ArrowObject, @@ -230,8 +229,6 @@ print_schema_fields <- function(s) { paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n") } -#' Schemas -#' #' Create a schema or extract one from an object. #' #' @seealso [Schema] for detailed documentation of the Schema R6 object @@ -383,7 +380,7 @@ length.Schema <- function(x) x$num_fields #' @export as.list.Schema <- function(x, ...) x$fields -#' read a Schema from a stream +#' Read a Schema from a stream #' #' @param stream a `Message`, `InputStream`, or `Buffer` #' @param ... currently ignored diff --git a/r/R/table.R b/r/R/table.R index 02ba41578d84b..ac2cbc1440f5b 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -74,8 +74,7 @@ #' Modify or replace by assigning in (`tab$metadata <- new_metadata`). #' All list elements are coerced to string. See `schema()` for more information. #' - `$columns`: Returns a list of `ChunkedArray`s -#' @rdname Table -#' @name Table +#' @rdname Table-class #' @export Table <- R6Class("Table", inherit = ArrowTabular, @@ -242,13 +241,15 @@ cbind.Table <- function(...) { Table$create(!!!columns) } +#' Create an Arrow Table +#' #' @param ... A `data.frame` or a named set of Arrays or vectors. If given a #' mixture of data.frames and named vectors, the inputs will be autospliced together #' (see examples). Alternatively, you can provide a single Arrow IPC #' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`. #' @param schema a [Schema], or `NULL` (the default) to infer the schema from #' the data in `...`. When providing an Arrow IPC buffer, `schema` is required. -#' @rdname Table +#' @rdname table #' @examples #' tbl <- arrow_table(name = rownames(mtcars), mtcars) #' dim(tbl) @@ -257,6 +258,7 @@ cbind.Table <- function(...) { #' tbl$mpg #' tbl[["cyl"]] #' as.data.frame(tbl[4:8, c("gear", "hp", "wt")]) +#' @seealso [Table] #' @export arrow_table <- Table$create diff --git a/r/R/type.R b/r/R/type.R index 1283fa256a88b..58d3267243220 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -35,7 +35,7 @@ #' - `$num_fields`: number of child fields. #' #' @seealso [infer_type()] -#' @rdname DataType +#' @rdname DataType-class #' @name DataType #' @seealso [`data-type`] DataType <- R6Class("DataType", @@ -304,7 +304,7 @@ Decimal256Type <- R6Class("Decimal256Type", inherit = DecimalType) NestedType <- R6Class("NestedType", inherit = DataType) -#' Apache Arrow data types +#' Create Arrow data types #' #' These functions create type objects corresponding to Arrow types. Use them #' when defining a [schema()] or as inputs to other types, like `struct`. Most @@ -378,6 +378,7 @@ NestedType <- R6Class("NestedType", inherit = DataType) #' @param ... For `struct()`, a named list of types to define the struct columns #' #' @name data-type +#' @rdname data-type #' @return An Arrow type object inheriting from [DataType]. 
#' @export #' @seealso [dictionary()] for creating a dictionary (factor-like) type. diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 5331fad53a824..9facce9d1b28b 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -179,9 +179,9 @@ reference: contents: - scalar - arrow_array - - ChunkedArray - - RecordBatch - - Table + - chunked_array + - record_batch + - arrow_table - buffer - vctrs_extension_array @@ -207,7 +207,7 @@ reference: - title: Fields and schemas contents: - - Field + - field - schema - unify_schemas - as_schema @@ -289,13 +289,18 @@ reference: - RecordBatchWriter - as_record_batch_reader -- title: Arrow data types, schemas, and containers - R6 classes +- title: Low-level C++ wrappers desc: > - R6 classes for Arrow data types. + Low-level R6 class representations of Arrow C++ objects intended for advanced users. contents: + - Buffer - Scalar - Array + - ChunkedArray + - RecordBatch - Schema + - Field + - Table - DataType - ArrayData - DictionaryType diff --git a/r/configure b/r/configure index 198a89cd85e83..a0f75f8ddb5cd 100755 --- a/r/configure +++ b/r/configure @@ -50,10 +50,10 @@ # Currently the configure script doesn't offer much to make this easy. # If you expect to rebuild multiple times, you should set up a dev # environment. -# * Installing a dev version as a regular developer. +# * Installing a dev version as a regular developer. # The best way is to maintain your own cmake build and install it # to a directory (not system) that you set as the env var -# $ARROW_HOME. +# $ARROW_HOME. # # For more information, see the various installation and developer vignettes. @@ -177,7 +177,7 @@ find_arrow () { else PC_LIB_VERSION=`grep '^Version' ${_LIBARROW_FOUND}/lib/pkgconfig/arrow.pc | sed s/Version:\ //` fi - # This is in an R script for convenience and testability. + # This is in an R script for convenience and testability. # Success means the found C++ library is ok to use. # Error means the versions don't line up and we shouldn't use it. # More specific messaging to the user is in the R script @@ -238,7 +238,7 @@ do_autobrew () { # Setup for local autobrew testing if [ -f "tools/apache-arrow.rb" ]; then # If you want to use a local apache-arrow.rb formula, do - # $ cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb tools/apache-arrow.rb + # $ cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow*.rb tools # before R CMD build or INSTALL (assuming a local checkout of the apache/arrow repository). # If you have this, you should use the local autobrew script so they match. cp tools/autobrew . diff --git a/r/man/Buffer-class.Rd b/r/man/Buffer-class.Rd new file mode 100644 index 0000000000000..edb56d41806a4 --- /dev/null +++ b/r/man/Buffer-class.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buffer.R +\docType{class} +\name{Buffer} +\alias{Buffer} +\title{Buffer class} +\description{ +A Buffer is an object containing a pointer to a piece of +contiguous memory with a particular size. +} +\section{Factory}{ + +\code{buffer()} lets you create an \code{arrow::Buffer} from an R object +} + +\section{Methods}{ + +\itemize{ +\item \verb{$is_mutable} : is this buffer mutable? +\item \verb{$ZeroPadding()} : zero bytes in padding, i.e. 
bytes between size and capacity +\item \verb{$size} : size in memory, in bytes +\item \verb{$capacity}: possible capacity, in bytes +} +} + +\examples{ +my_buffer <- buffer(c(1, 2, 3, 4)) +my_buffer$is_mutable +my_buffer$ZeroPadding() +my_buffer$size +my_buffer$capacity +} diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray-class.Rd similarity index 95% rename from r/man/ChunkedArray.Rd rename to r/man/ChunkedArray-class.Rd index 5cb3c4fe749a0..77fa58586686e 100644 --- a/r/man/ChunkedArray.Rd +++ b/r/man/ChunkedArray-class.Rd @@ -3,16 +3,7 @@ \docType{class} \name{ChunkedArray} \alias{ChunkedArray} -\alias{chunked_array} \title{ChunkedArray class} -\usage{ -chunked_array(..., type = NULL) -} -\arguments{ -\item{\dots}{Vectors to coerce} - -\item{type}{currently ignored} -} \description{ A \code{ChunkedArray} is a data structure managing a list of primitive Arrow \link[=Array]{Arrays} logically as one large array. Chunked arrays diff --git a/r/man/DataType.Rd b/r/man/DataType-class.Rd similarity index 100% rename from r/man/DataType.Rd rename to r/man/DataType-class.Rd diff --git a/r/man/Field-class.Rd b/r/man/Field-class.Rd new file mode 100644 index 0000000000000..35d8e236b2b3f --- /dev/null +++ b/r/man/Field-class.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/field.R +\docType{class} +\name{Field} +\alias{Field} +\title{Field class} +\description{ +\code{field()} lets you create an \code{arrow::Field} that maps a +\link[=data-type]{DataType} to a column name. Fields are contained in +\link[=Schema]{Schemas}. +} +\section{Methods}{ + +\itemize{ +\item \code{f$ToString()}: convert to a string +\item \code{f$Equals(other)}: test for equality. More naturally called as \code{f == other} +} +} + diff --git a/r/man/Field.Rd b/r/man/Field.Rd index 9c8f4794fc4b9..4d6fadcad3b09 100644 --- a/r/man/Field.Rd +++ b/r/man/Field.Rd @@ -1,10 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/field.R -\docType{class} -\name{Field} -\alias{Field} +\name{field} \alias{field} -\title{Field class} +\title{Create a Field} \usage{ field(name, type, metadata, nullable = TRUE) } @@ -18,18 +16,11 @@ field(name, type, metadata, nullable = TRUE) \item{nullable}{TRUE if field is nullable} } \description{ -\code{field()} lets you create an \code{arrow::Field} that maps a -\link[=data-type]{DataType} to a column name. Fields are contained in -\link[=Schema]{Schemas}. +Create a Field } -\section{Methods}{ - -\itemize{ -\item \code{f$ToString()}: convert to a string -\item \code{f$Equals(other)}: test for equality. More naturally called as \code{f == other} -} -} - \examples{ field("x", int32()) } +\seealso{ +\link{Field} +} diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch-class.Rd similarity index 82% rename from r/man/RecordBatch.Rd rename to r/man/RecordBatch-class.Rd index f936a6125b622..513301e8bbada 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch-class.Rd @@ -3,20 +3,7 @@ \docType{class} \name{RecordBatch} \alias{RecordBatch} -\alias{record_batch} \title{RecordBatch class} -\usage{ -record_batch(..., schema = NULL) -} -\arguments{ -\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a -mixture of data.frames and vectors, the inputs will be autospliced together -(see examples). 
Alternatively, you can provide a single Arrow IPC -\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.} - -\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from -the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.} -} \description{ A record batch is a collection of equal-length arrays matching a particular \link{Schema}. It is a table-like data structure that is semantically @@ -80,12 +67,3 @@ All list elements are coerced to string. See \code{schema()} for more informatio } } -\examples{ -batch <- record_batch(name = rownames(mtcars), mtcars) -dim(batch) -dim(head(batch)) -names(batch) -batch$mpg -batch[["cyl"]] -as.data.frame(batch[4:8, c("gear", "hp", "wt")]) -} diff --git a/r/man/schema-class.Rd b/r/man/Schema-class.Rd similarity index 100% rename from r/man/schema-class.Rd rename to r/man/Schema-class.Rd diff --git a/r/man/Table.Rd b/r/man/Table-class.Rd similarity index 82% rename from r/man/Table.Rd rename to r/man/Table-class.Rd index 0423728ef60cd..e8151f69f4569 100644 --- a/r/man/Table.Rd +++ b/r/man/Table-class.Rd @@ -3,20 +3,7 @@ \docType{class} \name{Table} \alias{Table} -\alias{arrow_table} \title{Table class} -\usage{ -arrow_table(..., schema = NULL) -} -\arguments{ -\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a -mixture of data.frames and named vectors, the inputs will be autospliced together -(see examples). Alternatively, you can provide a single Arrow IPC -\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.} - -\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from -the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.} -} \description{ A Table is a sequence of \link[=ChunkedArray]{chunked arrays}. They have a similar interface to \link[=RecordBatch]{record batches}, but they can be @@ -80,12 +67,3 @@ All list elements are coerced to string. See \code{schema()} for more informatio } } -\examples{ -tbl <- arrow_table(name = rownames(mtcars), mtcars) -dim(tbl) -dim(head(tbl)) -names(tbl) -tbl$mpg -tbl[["cyl"]] -as.data.frame(tbl[4:8, c("gear", "hp", "wt")]) -} diff --git a/r/man/buffer.Rd b/r/man/buffer.Rd index 08d66ece5dc3f..c03fd99b00624 100644 --- a/r/man/buffer.Rd +++ b/r/man/buffer.Rd @@ -1,10 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/buffer.R -\docType{class} \name{buffer} \alias{buffer} -\alias{Buffer} -\title{Buffer class} +\title{Create a Buffer} \usage{ buffer(x) } @@ -15,28 +13,8 @@ buffer(x) an instance of \code{Buffer} that borrows memory from \code{x} } \description{ -A Buffer is an object containing a pointer to a piece of -contiguous memory with a particular size. +Create a Buffer } -\section{Factory}{ - -\code{buffer()} lets you create an \code{arrow::Buffer} from an R object -} - -\section{Methods}{ - -\itemize{ -\item \verb{$is_mutable} : is this buffer mutable? -\item \verb{$ZeroPadding()} : zero bytes in padding, i.e. 
bytes between size and capacity -\item \verb{$size} : size in memory, in bytes -\item \verb{$capacity}: possible capacity, in bytes -} -} - -\examples{ -my_buffer <- buffer(c(1, 2, 3, 4)) -my_buffer$is_mutable -my_buffer$ZeroPadding() -my_buffer$size -my_buffer$capacity +\seealso{ +\link{Buffer} } diff --git a/r/man/chunked_array.Rd b/r/man/chunked_array.Rd new file mode 100644 index 0000000000000..b73fd454be18b --- /dev/null +++ b/r/man/chunked_array.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chunked-array.R +\name{chunked_array} +\alias{chunked_array} +\title{Create a Chunked Array} +\usage{ +chunked_array(..., type = NULL) +} +\arguments{ +\item{...}{R objects to coerce into a ChunkedArray. They must be of the same type.} + +\item{type}{An optional \link[=data-type]{data type}. If omitted, the type will be inferred from the data.} +} +\description{ +Create a Chunked Array +} +\examples{ +# Pass items into chunked_array as separate objects to create chunks +class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73)) + +# If you pass a list into chunked_array, you get a list of length 1 +list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8))) + +# When constructing a ChunkedArray, the first chunk is used to infer type. +infer_type(chunked_array(c(1, 2, 3), c(5L, 6L, 7L))) + +# Concatenating chunked arrays returns a new chunked array containing all chunks +a <- chunked_array(c(1, 2), 3) +b <- chunked_array(c(4, 5), 6) +c(a, b) +} +\seealso{ +\link{ChunkedArray} +} diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index 79b09a4f32164..214e8ddc1f6c7 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -40,7 +40,7 @@ \alias{fixed_size_list_of} \alias{MapType} \alias{map_of} -\title{Apache Arrow data types} +\title{Create Arrow data types} \usage{ int8() diff --git a/r/man/read_schema.Rd b/r/man/read_schema.Rd index 8738b8aebf740..94d35568de00b 100644 --- a/r/man/read_schema.Rd +++ b/r/man/read_schema.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/schema.R \name{read_schema} \alias{read_schema} -\title{read a Schema from a stream} +\title{Read a Schema from a stream} \usage{ read_schema(stream, ...) } @@ -15,5 +15,5 @@ read_schema(stream, ...) A \link{Schema} } \description{ -read a Schema from a stream +Read a Schema from a stream } diff --git a/r/man/record_batch.Rd b/r/man/record_batch.Rd new file mode 100644 index 0000000000000..6586009448e5c --- /dev/null +++ b/r/man/record_batch.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/record-batch.R +\name{record_batch} +\alias{record_batch} +\title{Create a RecordBatch} +\usage{ +record_batch(..., schema = NULL) +} +\arguments{ +\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a +mixture of data.frames and vectors, the inputs will be autospliced together +(see examples). Alternatively, you can provide a single Arrow IPC +\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.} + +\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from +the data in \code{...}. 
When providing an Arrow IPC buffer, \code{schema} is required.} +} +\description{ +Create a RecordBatch +} +\examples{ +batch <- record_batch(name = rownames(mtcars), mtcars) +dim(batch) +dim(head(batch)) +names(batch) +batch$mpg +batch[["cyl"]] +as.data.frame(batch[4:8, c("gear", "hp", "wt")]) +} diff --git a/r/man/schema.Rd b/r/man/schema.Rd index 42532d84b4271..65ab2eea0d27c 100644 --- a/r/man/schema.Rd +++ b/r/man/schema.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/schema.R \name{schema} \alias{schema} -\title{Schemas} +\title{Create a schema or extract one from an object.} \usage{ schema(...) } diff --git a/r/man/table.Rd b/r/man/table.Rd new file mode 100644 index 0000000000000..f83c56139beb0 --- /dev/null +++ b/r/man/table.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/table.R +\name{arrow_table} +\alias{arrow_table} +\title{Create an Arrow Table} +\usage{ +arrow_table(..., schema = NULL) +} +\arguments{ +\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a +mixture of data.frames and named vectors, the inputs will be autospliced together +(see examples). Alternatively, you can provide a single Arrow IPC +\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.} + +\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from +the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.} +} +\description{ +Create an Arrow Table +} +\examples{ +tbl <- arrow_table(name = rownames(mtcars), mtcars) +dim(tbl) +dim(head(tbl)) +names(tbl) +tbl$mpg +tbl[["cyl"]] +as.data.frame(tbl[4:8, c("gear", "hp", "wt")]) +} +\seealso{ +\link{Table} +} diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index 9c80c32735a85..31f393a27785d 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,8 @@ -
11.0.0.9000 (dev)
-11.0.0 (release)
+12.0.1.9000 (dev)
+12.0.1.1 (release)
+11.0.0.3
 10.0.1
 9.0.0
 8.0.0
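The dropdown above and versions.json below describe the same version list in two places. A drift check along the following lines can catch mismatches; this is a hypothetical helper sketch, not part of the change, and it assumes the pkgdown asset paths used in this repository:

    import json
    import pathlib
    import re

    assets = pathlib.Path("r/pkgdown/assets")
    html = (assets / "versions.html").read_text()
    entries = json.loads((assets / "versions.json").read_text())

    for entry in entries:
        # display names look like "12.0.1.1 (release)"; strip the suffix
        label = re.sub(r"\s*\((dev|release)\)$", "", entry["name"])
        assert label in html, f"{label} missing from versions.html"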
diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index c7500309b22df..4d7658e6e5e25 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -4,7 +4,7 @@ "version": "dev/" }, { - "name": "12.0.1 (release)", + "name": "12.0.1.1 (release)", "version": "" }, { diff --git a/r/pkgdown/extra.js b/r/pkgdown/extra.js deleted file mode 100644 index 4e67262ec181a..0000000000000 --- a/r/pkgdown/extra.js +++ /dev/null @@ -1,66 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -(function () { - // Load the rmarkdown tabset script - var script = document.createElement("script"); - script.type = "text/javascript"; - script.async = true; - script.src = - "https://cdn.jsdelivr.net/gh/rstudio/rmarkdown@47d837d3d9cd5e8e212b05767454f058db7d2789/inst/rmd/h/navigation-1.1/tabsets.js"; - script.integrity = "sha256-Rs54TE1FCN1uLM4f7VQEMiRTl1Ia7TiQLkMruItwV+Q="; - script.crossOrigin = "anonymous"; - - // Run the processing as the onload callback - script.onload = () => { - // Monkey patch the .html method to use the .text method - $(document).ready(function () { - (function ($) { - $.fn.html = function (content) { - return this.text(); - }; - })(jQuery); - - window.buildTabsets("toc"); - }); - - $(document).ready(function () { - $(".tabset-dropdown > .nav-tabs > li").click(function () { - $(this).parent().toggleClass("nav-tabs-open"); - }); - }); - - $(document).ready(function () { - /** - * The tabset creation above sometimes relies on empty headers to stop the - * tabbing. Though they shouldn't be included in the TOC in the first place, - * this will remove empty headers from the TOC after it's created. - */ - - // find all the empty elements and remove them (and their parents) - var empty_a = $("#toc").find("a").filter(":empty"); - empty_a.parent().remove(); - - // now find any empty
    s and remove them too - var empty_ul = $("#toc").find("ul").filter(":empty"); - empty_ul.remove(); - }); - - }; - - document.head.appendChild(script); -})(); diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index 0dc834dbfea16..fc202bfb3a99e 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1466,3 +1466,33 @@ test_that("str_remove and str_remove_all", { df ) }) + +test_that("GH-36720: stringr modifier functions can be called with namespace prefix", { + df <- tibble(x = c("Foo", "bar")) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace_all(x, stringr::regex("^f", ignore_case = TRUE), "baz")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_detect(x, stringr::fixed("f", ignore_case = TRUE), negate = TRUE)) %>% + collect(), + df + ) + + x <- Expression$field_ref("x") + + expect_error( + call_binding("str_detect", x, stringr::boundary(type = "character")), + "Pattern modifier `boundary()` not supported in Arrow", + fixed = TRUE + ) + expect_error( + call_binding("str_replace_all", x, stringr::coll("o", locale = "en"), "ó"), + "Pattern modifier `coll()` not supported in Arrow", + fixed = TRUE + ) +}) diff --git a/r/tools/autobrew b/r/tools/autobrew index f181309892174..35ffebcab3796 100644 --- a/r/tools/autobrew +++ b/r/tools/autobrew @@ -62,7 +62,7 @@ fi # Hardcode this for my custom autobrew build rm -f $BREWDIR/lib/*.dylib AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common -laws-crt-cpp -laws-c-io -laws-c-s3 -laws-c-auth -laws-c-http -laws-c-cal -laws-c-compression -laws-c-mqtt -lpthread -lcurl" -PKG_LIBS="-lparquet -larrow_dataset -larrow_acero -larrow -larrow_bundled_dependencies -lthrift -lbrotlienc-static -lbrotlidec-static -lbrotlicommon-static -llz4 -lsnappy -lzstd $AWS_LIBS" +PKG_LIBS="-lparquet -larrow_dataset -larrow_acero -larrow -larrow_bundled_dependencies -lthrift -lbrotlienc-static -lbrotlidec-static -lbrotlicommon-static -llz4 -lsnappy -lzstd $AWS_LIBS -lssl -lcrypto" PKG_DIRS="-L$BREWDIR/lib" # Prevent CRAN builder from linking against old libs in /usr/local/lib diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 8b353fd09bb5a..90ea868ea3491 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -30,7 +30,7 @@ if (test_mode && is.na(VERSION)) { dev_version <- package_version(VERSION)[1, 4] # Small dev versions are added for R-only changes during CRAN submission. -if (is.na(dev_version) || dev_version < 100) { +if (is.na(dev_version) || dev_version < "100") { VERSION <- package_version(VERSION)[1, 1:3] arrow_repo <- paste0(getOption("arrow.repo", sprintf("https://apache.jfrog.io/artifactory/arrow/r/%s", VERSION)), "/libarrow/") } else { diff --git a/r/tools/winlibs.R b/r/tools/winlibs.R index d941da4baa61f..b554770e40c9b 100644 --- a/r/tools/winlibs.R +++ b/r/tools/winlibs.R @@ -53,7 +53,7 @@ if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) { dev_version <- package_version(VERSION)[1, 4] # Small dev versions are added for R-only changes during CRAN submission. 
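  # dev_version is a package_version component; comparing against the quoted
  # "100" keeps this a version comparison, since a bare numeric literal can
  # fail on newer R releases.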
- if (is.na(dev_version) || dev_version < 100) { + if (is.na(dev_version) || dev_version < "100") { VERSION <- package_version(VERSION)[1, 1:3] get_file(rwinlib, VERSION) diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index 11b7cf2906c86..479af577aa848 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -87,7 +87,7 @@ If you need to alter both libarrow and the R package code, or if you can't get a There are five major steps to the process. -### Step 1 - Install dependencies {.tabset} +### Step 1 - Install dependencies When building libarrow, by default, system dependencies will be used if suitable versions are found. If system dependencies are not present, libarrow will build them during its own build process. The only dependencies that you need to install _outside_ of the build process are [cmake](https://cmake.org/) (for configuring the build) and [openssl](https://www.openssl.org/) if you are building with S3 support. diff --git a/swift/ArrowFlight/.gitignore b/swift/ArrowFlight/.gitignore new file mode 100644 index 0000000000000..d561187385c2d --- /dev/null +++ b/swift/ArrowFlight/.gitignore @@ -0,0 +1,9 @@ +.DS_Store +/.build +/Packages +/*.xcodeproj +xcuserdata/ +DerivedData/ +.swiftpm/ +.netrc +Package.resolved \ No newline at end of file diff --git a/swift/ArrowFlight/Package.swift b/swift/ArrowFlight/Package.swift new file mode 100644 index 0000000000000..f3caa83486764 --- /dev/null +++ b/swift/ArrowFlight/Package.swift @@ -0,0 +1,53 @@ +// swift-tools-version:5.7 +// The swift-tools-version declares the minimum version of Swift required to build this package. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import PackageDescription + +let package = Package( + name: "ArrowFlight", + platforms: [ + .macOS(.v10_15) + ], + products: [ + // Products define the executables and libraries a package produces, making them visible to other packages. + .library( + name: "ArrowFlight", + targets: ["ArrowFlight"]), + ], + dependencies: [ + .package(url: "https://github.com/grpc/grpc-swift.git", from: "1.15.0"), + .package(url: "https://github.com/apple/swift-protobuf.git", from: "1.6.0"), + .package(path: "../Arrow") + ], + targets: [ + // Targets are the basic building blocks of a package, defining a module or a test suite. + // Targets can depend on other targets in this package and products from dependencies. 
+ .target( + name: "ArrowFlight", + dependencies: [ + .product(name: "Arrow", package: "Arrow"), + .product(name: "GRPC", package: "grpc-swift"), + .product(name: "SwiftProtobuf", package: "swift-protobuf") + ]), + .testTarget( + name: "ArrowFlightTests", + dependencies: ["ArrowFlight"]), + ] +) diff --git a/swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift b/swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift new file mode 100644 index 0000000000000..8daaa19f07b50 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift @@ -0,0 +1,1343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// DO NOT EDIT. +// swift-format-ignore-file +// +// Generated by the protocol buffer compiler. +// Source: Flight.proto +// +import GRPC +import NIO +import NIOConcurrencyHelpers +import SwiftProtobuf + + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. Additionally, a flight service +/// can expose a set of actions that are available. +/// +/// Usage: instantiate `Arrow_Flight_Protocol_FlightServiceClient`, then call methods of this protocol to make API calls. +internal protocol Arrow_Flight_Protocol_FlightServiceClientProtocol: GRPCClient { + var serviceName: String { get } + var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { get } + + func handshake( + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_HandshakeResponse) -> Void + ) -> BidirectionalStreamingCall + + func listFlights( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_FlightInfo) -> Void + ) -> ServerStreamingCall + + func getFlightInfo( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? + ) -> UnaryCall + + func getSchema( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? 
+ ) -> UnaryCall + + func doGet( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> ServerStreamingCall + + func doPut( + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_PutResult) -> Void + ) -> BidirectionalStreamingCall + + func doExchange( + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> BidirectionalStreamingCall + + func doAction( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_Result) -> Void + ) -> ServerStreamingCall + + func listActions( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_ActionType) -> Void + ) -> ServerStreamingCall +} + +extension Arrow_Flight_Protocol_FlightServiceClientProtocol { + internal var serviceName: String { + return "arrow.flight.protocol.FlightService" + } + + /// + /// Handshake between client and server. Depending on the server, the + /// handshake may be required to determine the token that should be used for + /// future operations. Both request and response are streams to allow multiple + /// round-trips depending on auth mechanism. + /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// + /// - Parameters: + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. + internal func handshake( + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_HandshakeResponse) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Get a list of available streams given a particular criteria. Most flight + /// services will expose one or more streams that are readily available for + /// retrieval. This api allows listing the streams available for + /// consumption. A user can also provide a criteria. The criteria can limit + /// the subset of streams that can be listed via this interface. Each flight + /// service allows its own definition of how to consume criteria. + /// + /// - Parameters: + /// - request: Request to send to ListFlights. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func listFlights( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_FlightInfo) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [], + handler: handler + ) + } + + /// + /// For a given FlightDescriptor, get information about how the flight can be + /// consumed. 
This is a useful interface if the consumer of the interface + /// already can identify the specific flight to consume. This interface can + /// also allow a consumer to generate a flight stream through a specified + /// descriptor. For example, a flight descriptor might be something that + /// includes a SQL statement or a Pickled Python operation that will be + /// executed. In those cases, the descriptor will not be previously available + /// within the list of available streams provided by ListFlights but will be + /// available for consumption for the duration defined by the specific flight + /// service. + /// + /// - Parameters: + /// - request: Request to send to GetFlightInfo. + /// - callOptions: Call options. + /// - Returns: A `UnaryCall` with futures for the metadata, status and response. + internal func getFlightInfo( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> UnaryCall { + return self.makeUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [] + ) + } + + /// + /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + /// This is used when a consumer needs the Schema of flight stream. Similar to + /// GetFlightInfo this interface may generate a new flight that was not previously + /// available in ListFlights. + /// + /// - Parameters: + /// - request: Request to send to GetSchema. + /// - callOptions: Call options. + /// - Returns: A `UnaryCall` with futures for the metadata, status and response. + internal func getSchema( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> UnaryCall { + return self.makeUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [] + ) + } + + /// + /// Retrieve a single stream associated with a particular descriptor + /// associated with the referenced ticket. A Flight can be composed of one or + /// more streams where each stream can be retrieved using a separate opaque + /// ticket that the flight service uses for managing a collection of streams. + /// + /// - Parameters: + /// - request: Request to send to DoGet. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func doGet( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Push a stream to the flight service associated with a particular + /// flight stream. This allows a client of a flight service to upload a stream + /// of data. Depending on the particular flight service, a client consumer + /// could be allowed to upload a single stream per descriptor or an unlimited + /// number. 
In the latter, the service might implement a 'seal' action that + /// can be applied to a descriptor once all streams are uploaded. + /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// + /// - Parameters: + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. + internal func doPut( + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_PutResult) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Open a bidirectional data channel for a given descriptor. This + /// allows clients to send and receive arbitrary Arrow data and + /// application-specific metadata in a single logical stream. In + /// contrast to DoGet/DoPut, this is more suited for clients + /// offloading computation (rather than storage) to a Flight service. + /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// + /// - Parameters: + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. + internal func doExchange( + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Flight services can support an arbitrary number of simple actions in + /// addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + /// operations that are potentially available. DoAction allows a flight client + /// to do a specific action against a flight service. An action includes + /// opaque request and response objects that are specific to the type action + /// being undertaken. + /// + /// - Parameters: + /// - request: Request to send to DoAction. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func doAction( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_Result) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [], + handler: handler + ) + } + + /// + /// A flight service exposes all of the available action types that it has + /// along with descriptions. This allows different flight consumers to + /// understand the capabilities of the flight service. 
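[Aside: the generated listing resumes below with the parameters of `listActions`. The callback-style surface above is what both the deprecated `Arrow_Flight_Protocol_FlightServiceClient` and the `Arrow_Flight_Protocol_FlightServiceNIOClient` defined further down implement. A minimal sketch of driving it follows; the endpoint `localhost:8815` and the plaintext transport are assumptions for illustration, not part of this patch.

// Sketch only: assumes a Flight server is listening on localhost:8815.
import GRPC
import NIOPosix

func listFlightsExample() throws {
  let group = MultiThreadedEventLoopGroup(numberOfThreads: 1)
  defer { try? group.syncShutdownGracefully() }

  let channel = try GRPCChannelPool.with(
    target: .host("localhost", port: 8815),  // invented endpoint
    transportSecurity: .plaintext,
    eventLoopGroup: group
  )

  // The NIO client struct generated below conforms to the protocol above.
  let client = Arrow_Flight_Protocol_FlightServiceNIOClient(channel: channel)

  // Server-streaming RPC: the handler fires once per FlightInfo received.
  let call = client.listFlights(Arrow_Flight_Protocol_Criteria()) { info in
    print("flight: \(info.totalRecords) records, \(info.totalBytes) bytes")
  }

  // The call resolves with the final gRPC status once the stream ends.
  let status = try call.status.wait()
  print("ListFlights finished with status: \(status.code)")
}

End of aside.]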
+ /// + /// - Parameters: + /// - request: Request to send to ListActions. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func listActions( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_ActionType) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [], + handler: handler + ) + } +} + +@available(*, deprecated) +extension Arrow_Flight_Protocol_FlightServiceClient: @unchecked Sendable {} + +@available(*, deprecated, renamed: "Arrow_Flight_Protocol_FlightServiceNIOClient") +internal final class Arrow_Flight_Protocol_FlightServiceClient: Arrow_Flight_Protocol_FlightServiceClientProtocol { + private let lock = Lock() + private var _defaultCallOptions: CallOptions + private var _interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? + internal let channel: GRPCChannel + internal var defaultCallOptions: CallOptions { + get { self.lock.withLock { return self._defaultCallOptions } } + set { self.lock.withLockVoid { self._defaultCallOptions = newValue } } + } + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { + get { self.lock.withLock { return self._interceptors } } + set { self.lock.withLockVoid { self._interceptors = newValue } } + } + + /// Creates a client for the arrow.flight.protocol.FlightService service. + /// + /// - Parameters: + /// - channel: `GRPCChannel` to the service host. + /// - defaultCallOptions: Options to use for each service call if the user doesn't provide them. + /// - interceptors: A factory providing interceptors for each RPC. + internal init( + channel: GRPCChannel, + defaultCallOptions: CallOptions = CallOptions(), + interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? = nil + ) { + self.channel = channel + self._defaultCallOptions = defaultCallOptions + self._interceptors = interceptors + } +} + +internal struct Arrow_Flight_Protocol_FlightServiceNIOClient: Arrow_Flight_Protocol_FlightServiceClientProtocol { + internal var channel: GRPCChannel + internal var defaultCallOptions: CallOptions + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? + + /// Creates a client for the arrow.flight.protocol.FlightService service. + /// + /// - Parameters: + /// - channel: `GRPCChannel` to the service host. + /// - defaultCallOptions: Options to use for each service call if the user doesn't provide them. + /// - interceptors: A factory providing interceptors for each RPC. + internal init( + channel: GRPCChannel, + defaultCallOptions: CallOptions = CallOptions(), + interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? = nil + ) { + self.channel = channel + self.defaultCallOptions = defaultCallOptions + self.interceptors = interceptors + } +} + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. 
Additionally, a flight service +/// can expose a set of actions that are available. +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +internal protocol Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol: GRPCClient { + static var serviceDescriptor: GRPCServiceDescriptor { get } + var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { get } + + func makeHandshakeCall( + callOptions: CallOptions? + ) -> GRPCAsyncBidirectionalStreamingCall + + func makeListFlightsCall( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall + + func makeGetFlightInfoCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? + ) -> GRPCAsyncUnaryCall + + func makeGetSchemaCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? + ) -> GRPCAsyncUnaryCall + + func makeDoGetCall( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall + + func makeDoPutCall( + callOptions: CallOptions? + ) -> GRPCAsyncBidirectionalStreamingCall + + func makeDoExchangeCall( + callOptions: CallOptions? + ) -> GRPCAsyncBidirectionalStreamingCall + + func makeDoActionCall( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall + + func makeListActionsCall( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +extension Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol { + internal static var serviceDescriptor: GRPCServiceDescriptor { + return Arrow_Flight_Protocol_FlightServiceClientMetadata.serviceDescriptor + } + + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { + return nil + } + + internal func makeHandshakeCall( + callOptions: CallOptions? = nil + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [] + ) + } + + internal func makeListFlightsCall( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [] + ) + } + + internal func makeGetFlightInfoCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> GRPCAsyncUnaryCall { + return self.makeAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [] + ) + } + + internal func makeGetSchemaCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> GRPCAsyncUnaryCall { + return self.makeAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? 
[] + ) + } + + internal func makeDoGetCall( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [] + ) + } + + internal func makeDoPutCall( + callOptions: CallOptions? = nil + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [] + ) + } + + internal func makeDoExchangeCall( + callOptions: CallOptions? = nil + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [] + ) + } + + internal func makeDoActionCall( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [] + ) + } + + internal func makeListActionsCall( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [] + ) + } +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +extension Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol { + internal func handshake( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Arrow_Flight_Protocol_HandshakeRequest { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [] + ) + } + + internal func handshake( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Arrow_Flight_Protocol_HandshakeRequest { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [] + ) + } + + internal func listFlights( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights.path, + request: request, + callOptions: callOptions ?? 
self.defaultCallOptions, + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [] + ) + } + + internal func getFlightInfo( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) async throws -> Arrow_Flight_Protocol_FlightInfo { + return try await self.performAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [] + ) + } + + internal func getSchema( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) async throws -> Arrow_Flight_Protocol_SchemaResult { + return try await self.performAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [] + ) + } + + internal func doGet( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [] + ) + } + + internal func doPut( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [] + ) + } + + internal func doPut( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [] + ) + } + + internal func doExchange( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [] + ) + } + + internal func doExchange( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? 
[] + ) + } + + internal func doAction( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [] + ) + } + + internal func listActions( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [] + ) + } +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +internal struct Arrow_Flight_Protocol_FlightServiceAsyncClient: Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol { + internal var channel: GRPCChannel + internal var defaultCallOptions: CallOptions + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? + + internal init( + channel: GRPCChannel, + defaultCallOptions: CallOptions = CallOptions(), + interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? = nil + ) { + self.channel = channel + self.defaultCallOptions = defaultCallOptions + self.interceptors = interceptors + } +} + +internal protocol Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol: Sendable { + + /// - Returns: Interceptors to use when invoking 'handshake'. + func makeHandshakeInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'listFlights'. + func makeListFlightsInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'getFlightInfo'. + func makeGetFlightInfoInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'getSchema'. + func makeGetSchemaInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doGet'. + func makeDoGetInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doPut'. + func makeDoPutInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doExchange'. + func makeDoExchangeInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doAction'. + func makeDoActionInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'listActions'. 
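[Aside: the interceptor-factory protocol being defined here continues right after this note. The async-await extension that just concluded wraps each RPC either as an `async throws` unary call or as a `GRPCAsyncResponseStream` to iterate with `for try await`. A minimal usage sketch, reusing the invented `localhost:8815` endpoint; the descriptor path `example.parquet` is likewise invented.

// Sketch only: requires an async context (e.g. a @main async entry point).
import GRPC
import NIOPosix

func fetchFirstFlight() async throws {
  let group = MultiThreadedEventLoopGroup(numberOfThreads: 1)
  defer { try? group.syncShutdownGracefully() }
  let channel = try GRPCChannelPool.with(
    target: .host("localhost", port: 8815),  // invented endpoint
    transportSecurity: .plaintext,
    eventLoopGroup: group
  )
  let client = Arrow_Flight_Protocol_FlightServiceAsyncClient(channel: channel)

  // Unary RPC: getFlightInfo suspends until the server answers.
  var descriptor = Arrow_Flight_Protocol_FlightDescriptor()
  descriptor.type = .path
  descriptor.path = ["example.parquet"]  // invented dataset name
  let info = try await client.getFlightInfo(descriptor)

  // Server-streaming RPC: each FlightData message arrives as a stream element.
  for endpoint in info.endpoint {
    for try await data in client.doGet(endpoint.ticket) {
      print("received \(data.dataBody.count) body bytes")
    }
  }
}

End of aside.]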
+ func makeListActionsInterceptors() -> [ClientInterceptor] +} + +internal enum Arrow_Flight_Protocol_FlightServiceClientMetadata { + internal static let serviceDescriptor = GRPCServiceDescriptor( + name: "FlightService", + fullName: "arrow.flight.protocol.FlightService", + methods: [ + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions, + ] + ) + + internal enum Methods { + internal static let handshake = GRPCMethodDescriptor( + name: "Handshake", + path: "/arrow.flight.protocol.FlightService/Handshake", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let listFlights = GRPCMethodDescriptor( + name: "ListFlights", + path: "/arrow.flight.protocol.FlightService/ListFlights", + type: GRPCCallType.serverStreaming + ) + + internal static let getFlightInfo = GRPCMethodDescriptor( + name: "GetFlightInfo", + path: "/arrow.flight.protocol.FlightService/GetFlightInfo", + type: GRPCCallType.unary + ) + + internal static let getSchema = GRPCMethodDescriptor( + name: "GetSchema", + path: "/arrow.flight.protocol.FlightService/GetSchema", + type: GRPCCallType.unary + ) + + internal static let doGet = GRPCMethodDescriptor( + name: "DoGet", + path: "/arrow.flight.protocol.FlightService/DoGet", + type: GRPCCallType.serverStreaming + ) + + internal static let doPut = GRPCMethodDescriptor( + name: "DoPut", + path: "/arrow.flight.protocol.FlightService/DoPut", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doExchange = GRPCMethodDescriptor( + name: "DoExchange", + path: "/arrow.flight.protocol.FlightService/DoExchange", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doAction = GRPCMethodDescriptor( + name: "DoAction", + path: "/arrow.flight.protocol.FlightService/DoAction", + type: GRPCCallType.serverStreaming + ) + + internal static let listActions = GRPCMethodDescriptor( + name: "ListActions", + path: "/arrow.flight.protocol.FlightService/ListActions", + type: GRPCCallType.serverStreaming + ) + } +} + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. Additionally, a flight service +/// can expose a set of actions that are available. +/// +/// To build a server, implement a class that conforms to this protocol. +internal protocol Arrow_Flight_Protocol_FlightServiceProvider: CallHandlerProvider { + var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { get } + + /// + /// Handshake between client and server. Depending on the server, the + /// handshake may be required to determine the token that should be used for + /// future operations. Both request and response are streams to allow multiple + /// round-trips depending on auth mechanism. 
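[Aside: the provider's `handshake` requirement, described just above, continues immediately after this note. The client interceptor factory closed a few lines back returns arrays of `ClientInterceptor` instances, which grpc-swift threads into every call. A minimal sketch of one such interceptor; the type name `LoggingInterceptor` is invented for illustration.

// Sketch only: logs each outbound request part, then forwards it unchanged.
import GRPC
import NIOCore

final class LoggingInterceptor<Request, Response>: ClientInterceptor<Request, Response> {
  override func send(
    _ part: GRPCClientRequestPart<Request>,
    promise: EventLoopPromise<Void>?,
    context: ClientInterceptorContext<Request, Response>
  ) {
    print("outbound part: \(part)")  // metadata, message, or end
    context.send(part, promise: promise)
  }
}

Wiring it up would mean implementing the factory protocol above and returning `[LoggingInterceptor()]` from each `make*Interceptors()` method. End of aside.]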
+ func handshake(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> + + /// + /// Get a list of available streams given a particular criteria. Most flight + /// services will expose one or more streams that are readily available for + /// retrieval. This api allows listing the streams available for + /// consumption. A user can also provide a criteria. The criteria can limit + /// the subset of streams that can be listed via this interface. Each flight + /// service allows its own definition of how to consume criteria. + func listFlights(request: Arrow_Flight_Protocol_Criteria, context: StreamingResponseCallContext) -> EventLoopFuture + + /// + /// For a given FlightDescriptor, get information about how the flight can be + /// consumed. This is a useful interface if the consumer of the interface + /// already can identify the specific flight to consume. This interface can + /// also allow a consumer to generate a flight stream through a specified + /// descriptor. For example, a flight descriptor might be something that + /// includes a SQL statement or a Pickled Python operation that will be + /// executed. In those cases, the descriptor will not be previously available + /// within the list of available streams provided by ListFlights but will be + /// available for consumption for the duration defined by the specific flight + /// service. + func getFlightInfo(request: Arrow_Flight_Protocol_FlightDescriptor, context: StatusOnlyCallContext) -> EventLoopFuture + + /// + /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + /// This is used when a consumer needs the Schema of flight stream. Similar to + /// GetFlightInfo this interface may generate a new flight that was not previously + /// available in ListFlights. + func getSchema(request: Arrow_Flight_Protocol_FlightDescriptor, context: StatusOnlyCallContext) -> EventLoopFuture + + /// + /// Retrieve a single stream associated with a particular descriptor + /// associated with the referenced ticket. A Flight can be composed of one or + /// more streams where each stream can be retrieved using a separate opaque + /// ticket that the flight service uses for managing a collection of streams. + func doGet(request: Arrow_Flight_Protocol_Ticket, context: StreamingResponseCallContext) -> EventLoopFuture + + /// + /// Push a stream to the flight service associated with a particular + /// flight stream. This allows a client of a flight service to upload a stream + /// of data. Depending on the particular flight service, a client consumer + /// could be allowed to upload a single stream per descriptor or an unlimited + /// number. In the latter, the service might implement a 'seal' action that + /// can be applied to a descriptor once all streams are uploaded. + func doPut(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> + + /// + /// Open a bidirectional data channel for a given descriptor. This + /// allows clients to send and receive arbitrary Arrow data and + /// application-specific metadata in a single logical stream. In + /// contrast to DoGet/DoPut, this is more suited for clients + /// offloading computation (rather than storage) to a Flight service. 
+ func doExchange(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> + + /// + /// Flight services can support an arbitrary number of simple actions in + /// addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + /// operations that are potentially available. DoAction allows a flight client + /// to do a specific action against a flight service. An action includes + /// opaque request and response objects that are specific to the type action + /// being undertaken. + func doAction(request: Arrow_Flight_Protocol_Action, context: StreamingResponseCallContext) -> EventLoopFuture + + /// + /// A flight service exposes all of the available action types that it has + /// along with descriptions. This allows different flight consumers to + /// understand the capabilities of the flight service. + func listActions(request: Arrow_Flight_Protocol_Empty, context: StreamingResponseCallContext) -> EventLoopFuture +} + +extension Arrow_Flight_Protocol_FlightServiceProvider { + internal var serviceName: Substring { + return Arrow_Flight_Protocol_FlightServiceServerMetadata.serviceDescriptor.fullName[...] + } + + /// Determines, calls and returns the appropriate request handler, depending on the request's method. + /// Returns nil for methods not handled by this service. + internal func handle( + method name: Substring, + context: CallHandlerContext + ) -> GRPCServerHandlerProtocol? { + switch name { + case "Handshake": + return BidirectionalStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [], + observerFactory: self.handshake(context:) + ) + + case "ListFlights": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [], + userFunction: self.listFlights(request:context:) + ) + + case "GetFlightInfo": + return UnaryServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [], + userFunction: self.getFlightInfo(request:context:) + ) + + case "GetSchema": + return UnaryServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [], + userFunction: self.getSchema(request:context:) + ) + + case "DoGet": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [], + userFunction: self.doGet(request:context:) + ) + + case "DoPut": + return BidirectionalStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [], + observerFactory: self.doPut(context:) + ) + + case "DoExchange": + return BidirectionalStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? 
[], + observerFactory: self.doExchange(context:) + ) + + case "DoAction": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [], + userFunction: self.doAction(request:context:) + ) + + case "ListActions": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [], + userFunction: self.listActions(request:context:) + ) + + default: + return nil + } + } +} + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. Additionally, a flight service +/// can expose a set of actions that are available. +/// +/// To implement a server, implement an object which conforms to this protocol. +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +internal protocol Arrow_Flight_Protocol_FlightServiceAsyncProvider: CallHandlerProvider, Sendable { + static var serviceDescriptor: GRPCServiceDescriptor { get } + var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { get } + + /// + /// Handshake between client and server. Depending on the server, the + /// handshake may be required to determine the token that should be used for + /// future operations. Both request and response are streams to allow multiple + /// round-trips depending on auth mechanism. + func handshake( + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Get a list of available streams given a particular criteria. Most flight + /// services will expose one or more streams that are readily available for + /// retrieval. This api allows listing the streams available for + /// consumption. A user can also provide a criteria. The criteria can limit + /// the subset of streams that can be listed via this interface. Each flight + /// service allows its own definition of how to consume criteria. + func listFlights( + request: Arrow_Flight_Protocol_Criteria, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// For a given FlightDescriptor, get information about how the flight can be + /// consumed. This is a useful interface if the consumer of the interface + /// already can identify the specific flight to consume. This interface can + /// also allow a consumer to generate a flight stream through a specified + /// descriptor. For example, a flight descriptor might be something that + /// includes a SQL statement or a Pickled Python operation that will be + /// executed. In those cases, the descriptor will not be previously available + /// within the list of available streams provided by ListFlights but will be + /// available for consumption for the duration defined by the specific flight + /// service. + func getFlightInfo( + request: Arrow_Flight_Protocol_FlightDescriptor, + context: GRPCAsyncServerCallContext + ) async throws -> Arrow_Flight_Protocol_FlightInfo + + /// + /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + /// This is used when a consumer needs the Schema of flight stream. 
Similar to + /// GetFlightInfo this interface may generate a new flight that was not previously + /// available in ListFlights. + func getSchema( + request: Arrow_Flight_Protocol_FlightDescriptor, + context: GRPCAsyncServerCallContext + ) async throws -> Arrow_Flight_Protocol_SchemaResult + + /// + /// Retrieve a single stream associated with a particular descriptor + /// associated with the referenced ticket. A Flight can be composed of one or + /// more streams where each stream can be retrieved using a separate opaque + /// ticket that the flight service uses for managing a collection of streams. + func doGet( + request: Arrow_Flight_Protocol_Ticket, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Push a stream to the flight service associated with a particular + /// flight stream. This allows a client of a flight service to upload a stream + /// of data. Depending on the particular flight service, a client consumer + /// could be allowed to upload a single stream per descriptor or an unlimited + /// number. In the latter, the service might implement a 'seal' action that + /// can be applied to a descriptor once all streams are uploaded. + func doPut( + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Open a bidirectional data channel for a given descriptor. This + /// allows clients to send and receive arbitrary Arrow data and + /// application-specific metadata in a single logical stream. In + /// contrast to DoGet/DoPut, this is more suited for clients + /// offloading computation (rather than storage) to a Flight service. + func doExchange( + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Flight services can support an arbitrary number of simple actions in + /// addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + /// operations that are potentially available. DoAction allows a flight client + /// to do a specific action against a flight service. An action includes + /// opaque request and response objects that are specific to the type action + /// being undertaken. + func doAction( + request: Arrow_Flight_Protocol_Action, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// A flight service exposes all of the available action types that it has + /// along with descriptions. This allows different flight consumers to + /// understand the capabilities of the flight service. + func listActions( + request: Arrow_Flight_Protocol_Empty, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +extension Arrow_Flight_Protocol_FlightServiceAsyncProvider { + internal static var serviceDescriptor: GRPCServiceDescriptor { + return Arrow_Flight_Protocol_FlightServiceServerMetadata.serviceDescriptor + } + + internal var serviceName: Substring { + return Arrow_Flight_Protocol_FlightServiceServerMetadata.serviceDescriptor.fullName[...] + } + + internal var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { + return nil + } + + internal func handle( + method name: Substring, + context: CallHandlerContext + ) -> GRPCServerHandlerProtocol? 
{ + switch name { + case "Handshake": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [], + wrapping: { try await self.handshake(requestStream: $0, responseStream: $1, context: $2) } + ) + + case "ListFlights": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [], + wrapping: { try await self.listFlights(request: $0, responseStream: $1, context: $2) } + ) + + case "GetFlightInfo": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [], + wrapping: { try await self.getFlightInfo(request: $0, context: $1) } + ) + + case "GetSchema": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [], + wrapping: { try await self.getSchema(request: $0, context: $1) } + ) + + case "DoGet": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [], + wrapping: { try await self.doGet(request: $0, responseStream: $1, context: $2) } + ) + + case "DoPut": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [], + wrapping: { try await self.doPut(requestStream: $0, responseStream: $1, context: $2) } + ) + + case "DoExchange": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [], + wrapping: { try await self.doExchange(requestStream: $0, responseStream: $1, context: $2) } + ) + + case "DoAction": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [], + wrapping: { try await self.doAction(request: $0, responseStream: $1, context: $2) } + ) + + case "ListActions": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [], + wrapping: { try await self.listActions(request: $0, responseStream: $1, context: $2) } + ) + + default: + return nil + } + } +} + +internal protocol Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol: Sendable { + + /// - Returns: Interceptors to use when handling 'handshake'. + /// Defaults to calling `self.makeInterceptors()`. + func makeHandshakeInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'listFlights'. + /// Defaults to calling `self.makeInterceptors()`. + func makeListFlightsInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'getFlightInfo'. 
+ /// Defaults to calling `self.makeInterceptors()`. + func makeGetFlightInfoInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'getSchema'. + /// Defaults to calling `self.makeInterceptors()`. + func makeGetSchemaInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doGet'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoGetInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doPut'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoPutInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doExchange'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoExchangeInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doAction'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoActionInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'listActions'. + /// Defaults to calling `self.makeInterceptors()`. + func makeListActionsInterceptors() -> [ServerInterceptor] +} + +internal enum Arrow_Flight_Protocol_FlightServiceServerMetadata { + internal static let serviceDescriptor = GRPCServiceDescriptor( + name: "FlightService", + fullName: "arrow.flight.protocol.FlightService", + methods: [ + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.handshake, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.listFlights, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.getFlightInfo, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.getSchema, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doGet, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doPut, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doExchange, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doAction, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.listActions, + ] + ) + + internal enum Methods { + internal static let handshake = GRPCMethodDescriptor( + name: "Handshake", + path: "/arrow.flight.protocol.FlightService/Handshake", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let listFlights = GRPCMethodDescriptor( + name: "ListFlights", + path: "/arrow.flight.protocol.FlightService/ListFlights", + type: GRPCCallType.serverStreaming + ) + + internal static let getFlightInfo = GRPCMethodDescriptor( + name: "GetFlightInfo", + path: "/arrow.flight.protocol.FlightService/GetFlightInfo", + type: GRPCCallType.unary + ) + + internal static let getSchema = GRPCMethodDescriptor( + name: "GetSchema", + path: "/arrow.flight.protocol.FlightService/GetSchema", + type: GRPCCallType.unary + ) + + internal static let doGet = GRPCMethodDescriptor( + name: "DoGet", + path: "/arrow.flight.protocol.FlightService/DoGet", + type: GRPCCallType.serverStreaming + ) + + internal static let doPut = GRPCMethodDescriptor( + name: "DoPut", + path: "/arrow.flight.protocol.FlightService/DoPut", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doExchange = GRPCMethodDescriptor( + name: "DoExchange", + path: "/arrow.flight.protocol.FlightService/DoExchange", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doAction = GRPCMethodDescriptor( + name: "DoAction", + path: "/arrow.flight.protocol.FlightService/DoAction", + type: GRPCCallType.serverStreaming + ) + + internal static let listActions = GRPCMethodDescriptor( + name: 
"ListActions", + path: "/arrow.flight.protocol.FlightService/ListActions", + type: GRPCCallType.serverStreaming + ) + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift b/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift new file mode 100644 index 0000000000000..b50d4062529c2 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift @@ -0,0 +1,1366 @@ +// DO NOT EDIT. +// swift-format-ignore-file +// +// Generated by the Swift generator plugin for the protocol buffer compiler. +// Source: Flight.proto +// +// For information on using the generated types, please see the documentation: +// https://github.com/apple/swift-protobuf/ + +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +//
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift b/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift
new file mode 100644
index 0000000000000..b50d4062529c2
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift
@@ -0,0 +1,1366 @@
+// DO NOT EDIT.
+// swift-format-ignore-file
+//
+// Generated by the Swift generator plugin for the protocol buffer compiler.
+// Source: Flight.proto
+//
+// For information on using the generated types, please see the documentation:
+// https://github.com/apple/swift-protobuf/
+
+//
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
    +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation +import SwiftProtobuf + +// If the compiler emits an error on this type, it is because this file +// was generated by a version of the `protoc` Swift plug-in that is +// incompatible with the version of SwiftProtobuf to which you are linking. +// Please ensure that you are building against the same version of the API +// that was used to generate this file. +fileprivate struct _GeneratedWithProtocGenSwiftVersion: SwiftProtobuf.ProtobufAPIVersionCheck { + struct _2: SwiftProtobuf.ProtobufAPIVersion_2 {} + typealias Version = _2 +} + +/// +/// The result of a cancel operation. +/// +/// This is used by CancelFlightInfoResult.status. +enum Arrow_Flight_Protocol_CancelStatus: SwiftProtobuf.Enum { + typealias RawValue = Int + + /// The cancellation status is unknown. Servers should avoid using + /// this value (send a NOT_FOUND error if the requested query is + /// not known). Clients can retry the request. + case unspecified // = 0 + + /// The cancellation request is complete. Subsequent requests with + /// the same payload may return CANCELLED or a NOT_FOUND error. + case cancelled // = 1 + + /// The cancellation request is in progress. The client may retry + /// the cancellation request. + case cancelling // = 2 + + /// The query is not cancellable. The client should not retry the + /// cancellation request. + case notCancellable // = 3 + case UNRECOGNIZED(Int) + + init() { + self = .unspecified + } + + init?(rawValue: Int) { + switch rawValue { + case 0: self = .unspecified + case 1: self = .cancelled + case 2: self = .cancelling + case 3: self = .notCancellable + default: self = .UNRECOGNIZED(rawValue) + } + } + + var rawValue: Int { + switch self { + case .unspecified: return 0 + case .cancelled: return 1 + case .cancelling: return 2 + case .notCancellable: return 3 + case .UNRECOGNIZED(let i): return i + } + } + +} + +#if swift(>=4.2) + +extension Arrow_Flight_Protocol_CancelStatus: CaseIterable { + // The compiler won't synthesize support with the UNRECOGNIZED case. + static var allCases: [Arrow_Flight_Protocol_CancelStatus] = [ + .unspecified, + .cancelled, + .cancelling, + .notCancellable, + ] +} + +#endif // swift(>=4.2) + +/// +/// The request that a client provides to a server on handshake. +struct Arrow_Flight_Protocol_HandshakeRequest { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// A defined protocol version + var protocolVersion: UInt64 = 0 + + /// + /// Arbitrary auth/handshake info. + var payload: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +struct Arrow_Flight_Protocol_HandshakeResponse { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// A defined protocol version + var protocolVersion: UInt64 = 0 + + /// + /// Arbitrary auth/handshake info. 
+ var payload: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// A message for doing simple auth. +struct Arrow_Flight_Protocol_BasicAuth { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var username: String = String() + + var password: String = String() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +struct Arrow_Flight_Protocol_Empty { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// Describes an available action, including both the name used for execution +/// along with a short description of the purpose of the action. +struct Arrow_Flight_Protocol_ActionType { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: String = String() + + var description_p: String = String() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// A service specific expression that can be used to return a limited set +/// of available Arrow Flight streams. +struct Arrow_Flight_Protocol_Criteria { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var expression: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// An opaque action specific for the service. +struct Arrow_Flight_Protocol_Action { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: String = String() + + var body: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// The request of the CancelFlightInfo action. +/// +/// The request should be stored in Action.body. +struct Arrow_Flight_Protocol_CancelFlightInfoRequest { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var info: Arrow_Flight_Protocol_FlightInfo { + get {return _info ?? Arrow_Flight_Protocol_FlightInfo()} + set {_info = newValue} + } + /// Returns true if `info` has been explicitly set. + var hasInfo: Bool {return self._info != nil} + /// Clears the value of `info`. Subsequent reads from it will return its default value. + mutating func clearInfo() {self._info = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _info: Arrow_Flight_Protocol_FlightInfo? = nil +} + +/// +/// The request of the RenewFlightEndpoint action. +/// +/// The request should be stored in Action.body. +struct Arrow_Flight_Protocol_RenewFlightEndpointRequest { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. 
+ + var endpoint: Arrow_Flight_Protocol_FlightEndpoint { + get {return _endpoint ?? Arrow_Flight_Protocol_FlightEndpoint()} + set {_endpoint = newValue} + } + /// Returns true if `endpoint` has been explicitly set. + var hasEndpoint: Bool {return self._endpoint != nil} + /// Clears the value of `endpoint`. Subsequent reads from it will return its default value. + mutating func clearEndpoint() {self._endpoint = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _endpoint: Arrow_Flight_Protocol_FlightEndpoint? = nil +} + +/// +/// An opaque result returned after executing an action. +struct Arrow_Flight_Protocol_Result { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var body: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// The result of the CancelFlightInfo action. +/// +/// The result should be stored in Result.body. +struct Arrow_Flight_Protocol_CancelFlightInfoResult { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var status: Arrow_Flight_Protocol_CancelStatus = .unspecified + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// Wrap the result of a getSchema call +struct Arrow_Flight_Protocol_SchemaResult { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// The schema of the dataset in its IPC form: + /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + /// 4 bytes - the byte length of the payload + /// a flatbuffer Message whose header is the Schema + var schema: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// The name or tag for a Flight. May be used as a way to retrieve or generate +/// a flight or be used to expose a set of previously defined flights. +struct Arrow_Flight_Protocol_FlightDescriptor { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: Arrow_Flight_Protocol_FlightDescriptor.DescriptorType = .unknown + + /// + /// Opaque value used to express a command. Should only be defined when + /// type = CMD. + var cmd: Data = Data() + + /// + /// List of strings identifying a particular dataset. Should only be defined + /// when type = PATH. + var path: [String] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + /// + /// Describes what type of descriptor is defined. + enum DescriptorType: SwiftProtobuf.Enum { + typealias RawValue = Int + + /// Protobuf pattern, not used. + case unknown // = 0 + + /// + /// A named path that identifies a dataset. A path is composed of a string + /// or list of strings describing a particular dataset. This is conceptually + /// similar to a path inside a filesystem. + case path // = 1 + + /// + /// An opaque command to generate a dataset. 
+ case cmd // = 2 + case UNRECOGNIZED(Int) + + init() { + self = .unknown + } + + init?(rawValue: Int) { + switch rawValue { + case 0: self = .unknown + case 1: self = .path + case 2: self = .cmd + default: self = .UNRECOGNIZED(rawValue) + } + } + + var rawValue: Int { + switch self { + case .unknown: return 0 + case .path: return 1 + case .cmd: return 2 + case .UNRECOGNIZED(let i): return i + } + } + + } + + init() {} +} + +#if swift(>=4.2) + +extension Arrow_Flight_Protocol_FlightDescriptor.DescriptorType: CaseIterable { + // The compiler won't synthesize support with the UNRECOGNIZED case. + static var allCases: [Arrow_Flight_Protocol_FlightDescriptor.DescriptorType] = [ + .unknown, + .path, + .cmd, + ] +} + +#endif // swift(>=4.2) + +/// +/// The access coordinates for retrieval of a dataset. With a FlightInfo, a +/// consumer is able to determine how to retrieve a dataset. +struct Arrow_Flight_Protocol_FlightInfo { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// The schema of the dataset in its IPC form: + /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + /// 4 bytes - the byte length of the payload + /// a flatbuffer Message whose header is the Schema + var schema: Data = Data() + + /// + /// The descriptor associated with this info. + var flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor { + get {return _flightDescriptor ?? Arrow_Flight_Protocol_FlightDescriptor()} + set {_flightDescriptor = newValue} + } + /// Returns true if `flightDescriptor` has been explicitly set. + var hasFlightDescriptor: Bool {return self._flightDescriptor != nil} + /// Clears the value of `flightDescriptor`. Subsequent reads from it will return its default value. + mutating func clearFlightDescriptor() {self._flightDescriptor = nil} + + /// + /// A list of endpoints associated with the flight. To consume the + /// whole flight, all endpoints (and hence all Tickets) must be + /// consumed. Endpoints can be consumed in any order. + /// + /// In other words, an application can use multiple endpoints to + /// represent partitioned data. + /// + /// If the returned data has an ordering, an application can use + /// "FlightInfo.ordered = true" or should return the all data in a + /// single endpoint. Otherwise, there is no ordering defined on + /// endpoints or the data within. + /// + /// A client can read ordered data by reading data from returned + /// endpoints, in order, from front to back. + /// + /// Note that a client may ignore "FlightInfo.ordered = true". If an + /// ordering is important for an application, an application must + /// choose one of them: + /// + /// * An application requires that all clients must read data in + /// returned endpoints order. + /// * An application must return the all data in a single endpoint. + var endpoint: [Arrow_Flight_Protocol_FlightEndpoint] = [] + + /// Set these to -1 if unknown. + var totalRecords: Int64 = 0 + + var totalBytes: Int64 = 0 + + /// + /// FlightEndpoints are in the same order as the data. + var ordered: Bool = false + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor? = nil +} + +/// +/// A particular stream or split associated with a flight. +struct Arrow_Flight_Protocol_FlightEndpoint { + // SwiftProtobuf.Message conformance is added in an extension below. 
See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// Token used to retrieve this stream. + var ticket: Arrow_Flight_Protocol_Ticket { + get {return _ticket ?? Arrow_Flight_Protocol_Ticket()} + set {_ticket = newValue} + } + /// Returns true if `ticket` has been explicitly set. + var hasTicket: Bool {return self._ticket != nil} + /// Clears the value of `ticket`. Subsequent reads from it will return its default value. + mutating func clearTicket() {self._ticket = nil} + + /// + /// A list of URIs where this ticket can be redeemed via DoGet(). + /// + /// If the list is empty, the expectation is that the ticket can only + /// be redeemed on the current service where the ticket was + /// generated. + /// + /// If the list is not empty, the expectation is that the ticket can + /// be redeemed at any of the locations, and that the data returned + /// will be equivalent. In this case, the ticket may only be redeemed + /// at one of the given locations, and not (necessarily) on the + /// current service. + /// + /// In other words, an application can use multiple locations to + /// represent redundant and/or load balanced services. + var location: [Arrow_Flight_Protocol_Location] = [] + + /// + /// Expiration time of this stream. If present, clients may assume + /// they can retry DoGet requests. Otherwise, it is + /// application-defined whether DoGet requests may be retried. + var expirationTime: SwiftProtobuf.Google_Protobuf_Timestamp { + get {return _expirationTime ?? SwiftProtobuf.Google_Protobuf_Timestamp()} + set {_expirationTime = newValue} + } + /// Returns true if `expirationTime` has been explicitly set. + var hasExpirationTime: Bool {return self._expirationTime != nil} + /// Clears the value of `expirationTime`. Subsequent reads from it will return its default value. + mutating func clearExpirationTime() {self._expirationTime = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _ticket: Arrow_Flight_Protocol_Ticket? = nil + fileprivate var _expirationTime: SwiftProtobuf.Google_Protobuf_Timestamp? = nil +} + +/// +/// A location where a Flight service will accept retrieval of a particular +/// stream given a ticket. +struct Arrow_Flight_Protocol_Location { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var uri: String = String() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// An opaque identifier that the service can use to retrieve a particular +/// portion of a stream. +/// +/// Tickets are meant to be single use. It is an error/application-defined +/// behavior to reuse a ticket. +struct Arrow_Flight_Protocol_Ticket { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var ticket: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// A batch of Arrow data as part of a stream of batches. +struct Arrow_Flight_Protocol_FlightData { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// The descriptor of the data. 
This is only relevant when a client is + /// starting a new DoPut stream. + var flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor { + get {return _flightDescriptor ?? Arrow_Flight_Protocol_FlightDescriptor()} + set {_flightDescriptor = newValue} + } + /// Returns true if `flightDescriptor` has been explicitly set. + var hasFlightDescriptor: Bool {return self._flightDescriptor != nil} + /// Clears the value of `flightDescriptor`. Subsequent reads from it will return its default value. + mutating func clearFlightDescriptor() {self._flightDescriptor = nil} + + /// + /// Header for message data as described in Message.fbs::Message. + var dataHeader: Data = Data() + + /// + /// Application-defined metadata. + var appMetadata: Data = Data() + + /// + /// The actual batch of Arrow data. Preferably handled with minimal-copies + /// coming last in the definition to help with sidecar patterns (it is + /// expected that some implementations will fetch this field off the wire + /// with specialized code to avoid extra memory copies). + var dataBody: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor? = nil +} + +///* +/// The response message associated with the submission of a DoPut. +struct Arrow_Flight_Protocol_PutResult { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var appMetadata: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +#if swift(>=5.5) && canImport(_Concurrency) +extension Arrow_Flight_Protocol_CancelStatus: @unchecked Sendable {} +extension Arrow_Flight_Protocol_HandshakeRequest: @unchecked Sendable {} +extension Arrow_Flight_Protocol_HandshakeResponse: @unchecked Sendable {} +extension Arrow_Flight_Protocol_BasicAuth: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Empty: @unchecked Sendable {} +extension Arrow_Flight_Protocol_ActionType: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Criteria: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Action: @unchecked Sendable {} +extension Arrow_Flight_Protocol_CancelFlightInfoRequest: @unchecked Sendable {} +extension Arrow_Flight_Protocol_RenewFlightEndpointRequest: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Result: @unchecked Sendable {} +extension Arrow_Flight_Protocol_CancelFlightInfoResult: @unchecked Sendable {} +extension Arrow_Flight_Protocol_SchemaResult: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightDescriptor: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightDescriptor.DescriptorType: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightInfo: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightEndpoint: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Location: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Ticket: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightData: @unchecked Sendable {} +extension Arrow_Flight_Protocol_PutResult: @unchecked Sendable {} +#endif // swift(>=5.5) && canImport(_Concurrency) + +// MARK: - Code below here is support for the SwiftProtobuf runtime. 
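Before the runtime-support extensions below, a small round-trip sketch of the message structs defined above, using SwiftProtobuf's standard `serializedData()` and `init(serializedData:)` entry points (the dataset path is invented):

import Foundation
import SwiftProtobuf

func descriptorRoundTrip() throws {
    // Build a PATH descriptor (hypothetical dataset name).
    var descriptor = Arrow_Flight_Protocol_FlightDescriptor()
    descriptor.type = .path
    descriptor.path = ["datasets", "example"]

    // Encode to the protobuf wire format and decode it back.
    let bytes = try descriptor.serializedData()
    let decoded = try Arrow_Flight_Protocol_FlightDescriptor(serializedData: bytes)
    assert(decoded == descriptor)  // == comes from the generated conformance
}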
+ +fileprivate let _protobuf_package = "arrow.flight.protocol" + +extension Arrow_Flight_Protocol_CancelStatus: SwiftProtobuf._ProtoNameProviding { + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 0: .same(proto: "CANCEL_STATUS_UNSPECIFIED"), + 1: .same(proto: "CANCEL_STATUS_CANCELLED"), + 2: .same(proto: "CANCEL_STATUS_CANCELLING"), + 3: .same(proto: "CANCEL_STATUS_NOT_CANCELLABLE"), + ] +} + +extension Arrow_Flight_Protocol_HandshakeRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".HandshakeRequest" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "protocol_version"), + 2: .same(proto: "payload"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularUInt64Field(value: &self.protocolVersion) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.payload) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if self.protocolVersion != 0 { + try visitor.visitSingularUInt64Field(value: self.protocolVersion, fieldNumber: 1) + } + if !self.payload.isEmpty { + try visitor.visitSingularBytesField(value: self.payload, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_HandshakeRequest, rhs: Arrow_Flight_Protocol_HandshakeRequest) -> Bool { + if lhs.protocolVersion != rhs.protocolVersion {return false} + if lhs.payload != rhs.payload {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_HandshakeResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".HandshakeResponse" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "protocol_version"), + 2: .same(proto: "payload"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularUInt64Field(value: &self.protocolVersion) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.payload) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if self.protocolVersion != 0 { + try visitor.visitSingularUInt64Field(value: self.protocolVersion, fieldNumber: 1) + } + if !self.payload.isEmpty { + try visitor.visitSingularBytesField(value: self.payload, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_HandshakeResponse, rhs: Arrow_Flight_Protocol_HandshakeResponse) -> Bool { + if lhs.protocolVersion != rhs.protocolVersion {return false} + if lhs.payload != rhs.payload {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_BasicAuth: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".BasicAuth" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 2: .same(proto: "username"), + 3: .same(proto: "password"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 2: try { try decoder.decodeSingularStringField(value: &self.username) }() + case 3: try { try decoder.decodeSingularStringField(value: &self.password) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.username.isEmpty { + try visitor.visitSingularStringField(value: self.username, fieldNumber: 2) + } + if !self.password.isEmpty { + try visitor.visitSingularStringField(value: self.password, fieldNumber: 3) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_BasicAuth, rhs: Arrow_Flight_Protocol_BasicAuth) -> Bool { + if lhs.username != rhs.username {return false} + if lhs.password != rhs.password {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Empty: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Empty" + static let _protobuf_nameMap = SwiftProtobuf._NameMap() + + mutating func decodeMessage(decoder: inout D) throws { + while let _ = try decoder.nextFieldNumber() { + } + } + + func traverse(visitor: inout V) throws { + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Empty, rhs: Arrow_Flight_Protocol_Empty) -> Bool { + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_ActionType: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".ActionType" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .same(proto: "description"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline 
closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.type) }() + case 2: try { try decoder.decodeSingularStringField(value: &self.description_p) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.type.isEmpty { + try visitor.visitSingularStringField(value: self.type, fieldNumber: 1) + } + if !self.description_p.isEmpty { + try visitor.visitSingularStringField(value: self.description_p, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_ActionType, rhs: Arrow_Flight_Protocol_ActionType) -> Bool { + if lhs.type != rhs.type {return false} + if lhs.description_p != rhs.description_p {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Criteria: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Criteria" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "expression"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.expression) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.expression.isEmpty { + try visitor.visitSingularBytesField(value: self.expression, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Criteria, rhs: Arrow_Flight_Protocol_Criteria) -> Bool { + if lhs.expression != rhs.expression {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Action: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Action" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .same(proto: "body"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.type) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.body) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.type.isEmpty { + try visitor.visitSingularStringField(value: self.type, fieldNumber: 1) + } + if !self.body.isEmpty { + try visitor.visitSingularBytesField(value: self.body, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Action, rhs: Arrow_Flight_Protocol_Action) -> Bool { + if lhs.type != rhs.type {return false} + if lhs.body != rhs.body {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_CancelFlightInfoRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".CancelFlightInfoRequest" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "info"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._info) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._info { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_CancelFlightInfoRequest, rhs: Arrow_Flight_Protocol_CancelFlightInfoRequest) -> Bool { + if lhs._info != rhs._info {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_RenewFlightEndpointRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".RenewFlightEndpointRequest" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "endpoint"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._endpoint) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._endpoint { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_RenewFlightEndpointRequest, rhs: Arrow_Flight_Protocol_RenewFlightEndpointRequest) -> Bool { + if lhs._endpoint != rhs._endpoint {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Result: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Result" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "body"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.body) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.body.isEmpty { + try visitor.visitSingularBytesField(value: self.body, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Result, rhs: Arrow_Flight_Protocol_Result) -> Bool { + if lhs.body != rhs.body {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_CancelFlightInfoResult: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".CancelFlightInfoResult" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "status"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularEnumField(value: &self.status) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if self.status != .unspecified { + try visitor.visitSingularEnumField(value: self.status, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_CancelFlightInfoResult, rhs: Arrow_Flight_Protocol_CancelFlightInfoResult) -> Bool { + if lhs.status != rhs.status {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_SchemaResult: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".SchemaResult" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "schema"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.schema) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.schema.isEmpty { + try visitor.visitSingularBytesField(value: self.schema, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_SchemaResult, rhs: Arrow_Flight_Protocol_SchemaResult) -> Bool { + if lhs.schema != rhs.schema {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_FlightDescriptor: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".FlightDescriptor" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .same(proto: "cmd"), + 3: .same(proto: "path"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularEnumField(value: &self.type) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.cmd) }() + case 3: try { try decoder.decodeRepeatedStringField(value: &self.path) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if self.type != .unknown { + try visitor.visitSingularEnumField(value: self.type, fieldNumber: 1) + } + if !self.cmd.isEmpty { + try visitor.visitSingularBytesField(value: self.cmd, fieldNumber: 2) + } + if !self.path.isEmpty { + try visitor.visitRepeatedStringField(value: self.path, fieldNumber: 3) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_FlightDescriptor, rhs: Arrow_Flight_Protocol_FlightDescriptor) -> Bool { + if lhs.type != rhs.type {return false} + if lhs.cmd != rhs.cmd {return false} + if lhs.path != rhs.path {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_FlightDescriptor.DescriptorType: SwiftProtobuf._ProtoNameProviding { + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 0: .same(proto: "UNKNOWN"), + 1: .same(proto: "PATH"), + 2: .same(proto: "CMD"), + ] +} + +extension Arrow_Flight_Protocol_FlightInfo: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".FlightInfo" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "schema"), + 2: .standard(proto: "flight_descriptor"), + 3: .same(proto: "endpoint"), + 4: .standard(proto: "total_records"), + 5: .standard(proto: "total_bytes"), + 6: .same(proto: "ordered"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.schema) }() + case 2: try { try decoder.decodeSingularMessageField(value: &self._flightDescriptor) }() + case 3: try { try decoder.decodeRepeatedMessageField(value: &self.endpoint) }() + case 4: try { try decoder.decodeSingularInt64Field(value: &self.totalRecords) }() + case 5: try { try decoder.decodeSingularInt64Field(value: &self.totalBytes) }() + case 6: try { try decoder.decodeSingularBoolField(value: &self.ordered) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + if !self.schema.isEmpty { + try visitor.visitSingularBytesField(value: self.schema, fieldNumber: 1) + } + try { if let v = self._flightDescriptor { + try visitor.visitSingularMessageField(value: v, fieldNumber: 2) + } }() + if !self.endpoint.isEmpty { + try visitor.visitRepeatedMessageField(value: self.endpoint, fieldNumber: 3) + } + if self.totalRecords != 0 { + try visitor.visitSingularInt64Field(value: self.totalRecords, fieldNumber: 4) + } + if self.totalBytes != 0 { + try visitor.visitSingularInt64Field(value: self.totalBytes, fieldNumber: 5) + } + if self.ordered != false { + try visitor.visitSingularBoolField(value: self.ordered, fieldNumber: 6) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_FlightInfo, rhs: Arrow_Flight_Protocol_FlightInfo) -> Bool { + if lhs.schema != rhs.schema {return false} + if lhs._flightDescriptor != rhs._flightDescriptor {return false} + if lhs.endpoint != rhs.endpoint {return false} + if lhs.totalRecords != rhs.totalRecords {return false} + if lhs.totalBytes != rhs.totalBytes {return false} + if lhs.ordered != rhs.ordered {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_FlightEndpoint: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".FlightEndpoint" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "ticket"), + 2: .same(proto: "location"), + 3: .standard(proto: "expiration_time"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._ticket) }() + case 2: try { try decoder.decodeRepeatedMessageField(value: &self.location) }() + case 3: try { try decoder.decodeSingularMessageField(value: &self._expirationTime) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._ticket { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + if !self.location.isEmpty { + try visitor.visitRepeatedMessageField(value: self.location, fieldNumber: 2) + } + try { if let v = self._expirationTime { + try visitor.visitSingularMessageField(value: v, fieldNumber: 3) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_FlightEndpoint, rhs: Arrow_Flight_Protocol_FlightEndpoint) -> Bool { + if lhs._ticket != rhs._ticket {return false} + if lhs.location != rhs.location {return false} + if lhs._expirationTime != rhs._expirationTime {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Location: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Location" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "uri"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.uri) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.uri.isEmpty { + try visitor.visitSingularStringField(value: self.uri, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Location, rhs: Arrow_Flight_Protocol_Location) -> Bool { + if lhs.uri != rhs.uri {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Ticket: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Ticket" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "ticket"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.ticket) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.ticket.isEmpty { + try visitor.visitSingularBytesField(value: self.ticket, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Ticket, rhs: Arrow_Flight_Protocol_Ticket) -> Bool { + if lhs.ticket != rhs.ticket {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_FlightData: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".FlightData" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "flight_descriptor"), + 2: .standard(proto: "data_header"), + 3: .standard(proto: "app_metadata"), + 1000: .standard(proto: "data_body"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._flightDescriptor) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.dataHeader) }() + case 3: try { try decoder.decodeSingularBytesField(value: &self.appMetadata) }() + case 1000: try { try decoder.decodeSingularBytesField(value: &self.dataBody) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._flightDescriptor { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + if !self.dataHeader.isEmpty { + try visitor.visitSingularBytesField(value: self.dataHeader, fieldNumber: 2) + } + if !self.appMetadata.isEmpty { + try visitor.visitSingularBytesField(value: self.appMetadata, fieldNumber: 3) + } + if !self.dataBody.isEmpty { + try visitor.visitSingularBytesField(value: self.dataBody, fieldNumber: 1000) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_FlightData, rhs: Arrow_Flight_Protocol_FlightData) -> Bool { + if lhs._flightDescriptor != rhs._flightDescriptor {return false} + if lhs.dataHeader != rhs.dataHeader {return false} + if lhs.appMetadata != rhs.appMetadata {return false} + if lhs.dataBody != rhs.dataBody {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_PutResult: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".PutResult" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "app_metadata"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.appMetadata) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.appMetadata.isEmpty { + try visitor.visitSingularBytesField(value: self.appMetadata, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_PutResult, rhs: Arrow_Flight_Protocol_PutResult) -> Bool { + if lhs.appMetadata != rhs.appMetadata {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift new file mode 100644 index 0000000000000..04e917d474cff --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+import Foundation
+
+public class FlightAction {
+    public let type: String
+    public let body: Data
+
+    init(_ action: Arrow_Flight_Protocol_Action) {
+        self.type = action.type
+        self.body = action.body
+    }
+
+    public init(_ type: String, body: Data = Data()) {
+        self.type = type
+        self.body = body
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Action {
+        var flightAction = Arrow_Flight_Protocol_Action()
+        flightAction.type = self.type
+        flightAction.body = self.body
+        return flightAction
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift
new file mode 100644
index 0000000000000..b3b06793feade
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightActionType {
+    public let type: String
+    public let description: String
+
+    init(_ actionType: Arrow_Flight_Protocol_ActionType) {
+        self.type = actionType.type
+        self.description = actionType.description_p
+    }
+
+    public init(_ type: String, description: String) {
+        self.type = type
+        self.description = description
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_ActionType {
+        var actionType = Arrow_Flight_Protocol_ActionType()
+        actionType.type = self.type
+        actionType.description_p = self.description
+        return actionType
+    }
+}
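A usage sketch for these wrappers (the action name and body are hypothetical, and `toProtocol()` is internal, so this would live inside the ArrowFlight module):

import struct Foundation.Data

// Wrap an application-defined action; Flight treats both fields as opaque.
// "clear-cache" is an assumed action name, not part of the protocol.
let action = FlightAction("clear-cache", body: Data("tables/users".utf8))
assert(action.toProtocol().type == "clear-cache")

// The matching ListActions entry a server might advertise.
let actionType = FlightActionType("clear-cache", description: "Evict cached tables")
assert(actionType.toProtocol().description_p == "Evict cached tables")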
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift
new file mode 100644
index 0000000000000..f7b8564af31d7
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import struct Foundation.Data
+import struct Foundation.URL
+import GRPC
+import NIOCore
+import NIOPosix
+import Arrow
+
+public class FlightClient {
+    let client: Arrow_Flight_Protocol_FlightServiceAsyncClient
+
+    public init(channel: GRPCChannel) {
+        client = Arrow_Flight_Protocol_FlightServiceAsyncClient(channel: channel)
+    }
+
+    public func listActions(_ closure: (FlightActionType) -> Void) async throws {
+        let listActions = client.makeListActionsCall(Arrow_Flight_Protocol_Empty())
+        for try await data in listActions.responseStream {
+            closure(FlightActionType(data))
+        }
+    }
+
+    public func listFlights(_ criteria: FlightCriteria, closure: (FlightInfo) throws -> Void) async throws {
+        let listFlights = client.makeListFlightsCall(criteria.toProtocol())
+        for try await data in listFlights.responseStream {
+            try closure(FlightInfo(data))
+        }
+    }
+
+    public func doAction(_ action: FlightAction, closure: (FlightResult) throws -> Void) async throws {
+        let actionResponse = client.makeDoActionCall(action.toProtocol())
+        for try await data in actionResponse.responseStream {
+            try closure(FlightResult(data))
+        }
+    }
+
+    public func getSchema(_ descriptor: FlightDescriptor) async throws -> FlightSchemaResult {
+        let schemaResultResponse = client.makeGetSchemaCall(descriptor.toProtocol())
+        return FlightSchemaResult(try await schemaResultResponse.response)
+    }
+
+    public func doGet(_ ticket: FlightTicket, readerResultClosure: (ArrowReader.ArrowReaderResult) throws -> Void) async throws {
+        let getResult = client.makeDoGetCall(ticket.toProtocol())
+        let reader = ArrowReader()
+        for try await data in getResult.responseStream {
+            switch reader.fromStream(data.dataBody) {
+            case .success(let rb):
+                try readerResultClosure(rb)
+            case .failure(let error):
+                throw error
+            }
+        }
+    }
+
+    public func doGet(_ ticket: FlightTicket, flightDataClosure: (FlightData) throws -> Void) async throws {
+        let getResult = client.makeDoGetCall(ticket.toProtocol())
+        for try await data in getResult.responseStream {
+            try flightDataClosure(FlightData(data))
+        }
+    }
+
+    public func doPut(_ recordBatches: [RecordBatch], closure: (FlightPutResult) throws -> Void) async throws {
+        if recordBatches.isEmpty {
+            throw ArrowFlightError.EmptyCollection
+        }
+
+        let putCall = client.makeDoPutCall()
+        let writer = ArrowWriter()
+        let writerInfo = ArrowWriter.Info(.recordbatch, schema: recordBatches[0].schema, batches: recordBatches)
+        switch writer.toStream(writerInfo) {
+        case .success(let data):
+            try await putCall.requestStream.send(FlightData(data).toProtocol())
+            putCall.requestStream.finish()
+            for try await response in putCall.responseStream {
+                try closure(FlightPutResult(response))
+            }
+        case .failure(let error):
+            throw error
+        }
+    }
+
+    public func doPut(flightData: FlightData, closure: (FlightPutResult) throws -> Void) async throws {
+        let putCall = client.makeDoPutCall()
+        try await putCall.requestStream.send(flightData.toProtocol())
+        putCall.requestStream.finish()
+        for try await response in putCall.responseStream {
+            try closure(FlightPutResult(response))
+        }
+    }
+
+    public func doExchange(_ recordBatches: [RecordBatch], closure: (ArrowReader.ArrowReaderResult) throws -> Void) async throws {
+        if recordBatches.isEmpty {
+            throw ArrowFlightError.EmptyCollection
+        }
+
+        let exchangeCall = client.makeDoExchangeCall()
+        let writer = ArrowWriter()
+        let info = ArrowWriter.Info(.recordbatch, schema: recordBatches[0].schema, batches: recordBatches)
+        switch writer.toStream(info) {
+        case .success(let data):
+            let request = Arrow_Flight_Protocol_FlightData.with {
+                $0.dataBody = data
+            }
+            try await exchangeCall.requestStream.send(request)
+            exchangeCall.requestStream.finish()
+            let reader = ArrowReader()
+            for try await response in exchangeCall.responseStream {
+                switch reader.fromStream(response.dataBody) {
+                case .success(let rbResult):
+                    try closure(rbResult)
+                case .failure(let error):
+                    throw error
+                }
+            }
+        case .failure(let error):
+            throw error
+        }
+    }
+
+    public func doExchange(flightData: FlightData, closure: (FlightData) throws -> Void) async throws {
+        let exchangeCall = client.makeDoExchangeCall()
+        try await exchangeCall.requestStream.send(flightData.toProtocol())
+        exchangeCall.requestStream.finish()
+        for try await response in exchangeCall.responseStream {
+            try closure(FlightData(response))
+        }
+    }
+}
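A minimal end-to-end sketch, assuming a plaintext Flight server on localhost:8088 and the one-argument `FlightTicket` initializer from the companion FlightTicket.swift in this PR; the channel comes from grpc-swift's `GRPCChannelPool`:

import GRPC
import NIOPosix
import struct Foundation.Data

func readFlight() async throws {
    let group = MultiThreadedEventLoopGroup(numberOfThreads: 1)
    defer { try? group.syncShutdownGracefully() }

    // Plaintext channel to a hypothetical local Flight server.
    let channel = try GRPCChannelPool.with(
        target: .host("localhost", port: 8088),
        transportSecurity: .plaintext,
        eventLoopGroup: group
    )
    let client = FlightClient(channel: channel)

    // Ticket bytes are application-defined; "my-ticket" is made up.
    let ticket = FlightTicket(Data("my-ticket".utf8))
    try await client.doGet(ticket) { (flightData: FlightData) in
        print("received \(flightData.dataBody.count) body bytes")
    }
}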
Arrow_Flight_Protocol_FlightData.with { + $0.dataBody = data + } + try await exchangeCall.requestStream.send(request) + exchangeCall.requestStream.finish() + let reader = ArrowReader() + for try await response in exchangeCall.responseStream { + switch reader.fromStream(response.dataBody) { + case .success(let rbResult): + try closure(rbResult) + case .failure(let error): + throw error + } + } + case .failure(let error): + throw error + } + } + + public func doExchange(fligthData: FlightData, closure: (FlightData) throws -> Void) async throws { + let exchangeCall = client.makeDoExchangeCall() + try await exchangeCall.requestStream.send(fligthData.toProtocol()) + exchangeCall.requestStream.finish() + for try await response in exchangeCall.responseStream { + try closure(FlightData(response)) + } + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift new file mode 100644 index 0000000000000..a887a22ad1737 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import Foundation + +public class FlightCriteria { + let criteria: Arrow_Flight_Protocol_Criteria + + public var expression: Data { criteria.expression } + public init(_ expression: Data = Data()) { + criteria = Arrow_Flight_Protocol_Criteria.with { + $0.expression = expression + } + } + + init(_ criteria: Arrow_Flight_Protocol_Criteria) { + self.criteria = criteria + } + + func toProtocol() -> Arrow_Flight_Protocol_Criteria { + return criteria + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift new file mode 100644 index 0000000000000..004fb785f0c11 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
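`FlightClient` wraps the generated `Arrow_Flight_Protocol_FlightServiceAsyncClient`; every RPC is `async` and surfaces streamed messages through a closure. A minimal connect-and-read sketch, assuming a plaintext server on localhost:8088 (the address the tests in this patch use) and an opaque, server-defined ticket payload:

```swift
import GRPC
import NIOPosix

// Connect over plaintext and fetch one stream; the ticket bytes are
// placeholders for whatever the server hands out in a FlightInfo.
func fetchExample() async throws {
    let group = PlatformSupport.makeEventLoopGroup(loopCount: 1)
    defer { try? group.syncShutdownGracefully() }

    let channel = try GRPCChannelPool.with(
        target: .host("localhost", port: 8088),
        transportSecurity: .plaintext,
        eventLoopGroup: group
    )
    defer { try? channel.close().wait() }

    let client = FlightClient(channel: channel)
    let ticket = FlightTicket("my_dataset".data(using: .utf8)!)
    try await client.doGet(ticket, readerResultClosure: { result in
        for batch in result.batches {
            print("received a batch with \(batch.length) row(s)")
        }
    })
}
```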
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift
new file mode 100644
index 0000000000000..004fb785f0c11
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightData {
+    let flightData: Arrow_Flight_Protocol_FlightData
+
+    public var flightDescriptor: FlightDescriptor? {
+        return flightData.hasFlightDescriptor ? FlightDescriptor(flightData.flightDescriptor) : nil
+    }
+
+    public var dataBody: Data { flightData.dataBody }
+
+    init(_ flightData: Arrow_Flight_Protocol_FlightData) {
+        self.flightData = flightData
+    }
+
+    public init(_ dataBody: Data, flightDescriptor: FlightDescriptor? = nil) {
+        self.flightData = Arrow_Flight_Protocol_FlightData.with {
+            $0.dataBody = dataBody
+            if let descriptor = flightDescriptor {
+                $0.flightDescriptor = descriptor.toProtocol()
+            }
+        }
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightData { self.flightData }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift
new file mode 100644
index 0000000000000..68bc91a3deda1
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightDescriptor {
+    public enum DescriptorType {
+        case unknown
+        case path
+        case cmd
+    }
+
+    public let type: DescriptorType
+    public let cmd: Data
+    public let paths: [String]
+
+    init(_ descriptor: Arrow_Flight_Protocol_FlightDescriptor) {
+        switch descriptor.type {
+        case .cmd: self.type = .cmd
+        case .path: self.type = .path
+        default: self.type = .unknown
+        }
+        self.cmd = descriptor.cmd
+        self.paths = descriptor.path
+    }
+
+    public init(cmd: Data) {
+        self.type = .cmd
+        self.cmd = cmd
+        self.paths = [String]()
+    }
+
+    public init(paths: [String]) {
+        self.type = .path
+        self.cmd = Data()
+        self.paths = paths
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightDescriptor {
+        var descriptor = Arrow_Flight_Protocol_FlightDescriptor()
+        switch self.type {
+        case .cmd: descriptor.type = .cmd
+        case .path: descriptor.type = .path
+        case .unknown: descriptor.type = .unknown
+        }
+        descriptor.cmd = self.cmd
+        descriptor.path = self.paths
+        return descriptor
+    }
+}
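A descriptor identifies a dataset either by an opaque command or by a path list, and each constructor pins `type` accordingly. For illustration (both payloads are made up):

```swift
// Two illustrative descriptors: one command-based, one path-based.
let byCommand = FlightDescriptor(cmd: "SELECT * FROM t".data(using: .utf8)!)
let byPath = FlightDescriptor(paths: ["warehouse", "2023", "t.parquet"])

assert(byCommand.type == .cmd && byPath.type == .path)
```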
diff --git a/cpp/src/arrow/flight/try_compile/check_tls_opts_136.cc b/swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift
similarity index 52%
rename from cpp/src/arrow/flight/try_compile/check_tls_opts_136.cc
rename to swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift
index 638eec67ba723..7c40a2a157ae8 100644
--- a/cpp/src/arrow/flight/try_compile/check_tls_opts_136.cc
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift
@@ -15,24 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-// Dummy file for checking if TlsCredentialsOptions exists in
-// the grpc::experimental namespace. gRPC starting from 1.36
-// puts it here. This is for supporting disabling server
-// validation when using TLS.
-
-#include <memory>
-#include <grpc/grpc_security_constants.h>
-#include <grpcpp/security/tls_credentials_options.h>
-
-static void check() {
-  // In 1.34, there's no parameterless constructor; in 1.36, there's
-  // only a parameterless constructor
-  auto options = std::make_shared<grpc::experimental::TlsCredentialsOptions>();
-  options->set_server_verification_option(
-      grpc_tls_server_verification_option::GRPC_TLS_SERVER_VERIFICATION);
-}
-
-int main(int argc, const char** argv) {
-  check();
-  return 0;
+import Foundation
+
+public class FlightEndpoint {
+    let ticket: FlightTicket
+    let locations: [FlightLocation]
+
+    init(_ endpoint: Arrow_Flight_Protocol_FlightEndpoint) {
+        self.ticket = FlightTicket(endpoint.ticket.ticket)
+        self.locations = endpoint.location.map { FlightLocation($0) }
+    }
+
+    public init(_ ticket: FlightTicket, locations: [FlightLocation]) {
+        self.ticket = ticket
+        self.locations = locations
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightEndpoint {
+        var endpoint = Arrow_Flight_Protocol_FlightEndpoint()
+        endpoint.ticket = self.ticket.toProtocol()
+        endpoint.location = self.locations.map { $0.toProtocol() }
+        return endpoint
+    }
 }
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift
new file mode 100644
index 0000000000000..b370c00db3d42
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import Arrow
+
+public class FlightInfo {
+    let flightInfo: Arrow_Flight_Protocol_FlightInfo
+
+    public var flightDescriptor: FlightDescriptor? {
+        return flightInfo.hasFlightDescriptor ? FlightDescriptor(flightInfo.flightDescriptor) : nil
+    }
+
+    public var endpoints: [FlightEndpoint] {
+        return self.flightInfo.endpoint.map { FlightEndpoint($0) }
+    }
+
+    public var schema: Data { flightInfo.schema }
+
+    init(_ flightInfo: Arrow_Flight_Protocol_FlightInfo) {
+        self.flightInfo = flightInfo
+    }
+
+    public init(_ schema: Data, endpoints: [FlightEndpoint] = [FlightEndpoint](), descriptor: FlightDescriptor? = nil) {
+        self.flightInfo = Arrow_Flight_Protocol_FlightInfo.with {
+            $0.schema = schema
+            $0.endpoint = endpoints.map { $0.toProtocol() }
+            if let localDescriptor = descriptor {
+                $0.flightDescriptor = localDescriptor.toProtocol()
+            }
+        }
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightInfo {
+        return self.flightInfo
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift
new file mode 100644
index 0000000000000..b87671c903d44
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightLocation {
+    public let uri: String
+
+    init(_ location: Arrow_Flight_Protocol_Location) {
+        self.uri = location.uri
+    }
+
+    public init(_ uri: String) {
+        self.uri = uri
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Location {
+        var location = Arrow_Flight_Protocol_Location()
+        location.uri = uri
+        return location
+    }
+}
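Endpoints tie a ticket to the locations that can serve it, and `FlightInfo` bundles those endpoints with an IPC-stream-encoded schema. A rough assembly sketch, where the ticket bytes, URI, and path are placeholders and `schemaBytes` is assumed to already hold an encoded schema:

```swift
import Foundation

// Illustrative FlightInfo assembly for a getFlightInfo-style response.
func exampleFlightInfo(schemaBytes: Data) -> FlightInfo {
    let ticket = FlightTicket("partition-0".data(using: .utf8)!)
    let endpoint = FlightEndpoint(ticket, locations: [FlightLocation("grpc://localhost:8088")])
    return FlightInfo(schemaBytes,
                      endpoints: [endpoint],
                      descriptor: FlightDescriptor(paths: ["example"]))
}
```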
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift
new file mode 100644
index 0000000000000..bf73c716e39c0
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightPutResult {
+    public let appMetadata: Data
+
+    public init(_ appMetadata: Data = Data()) {
+        self.appMetadata = appMetadata
+    }
+
+    init(_ putResult: Arrow_Flight_Protocol_PutResult) {
+        self.appMetadata = putResult.appMetadata
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_PutResult {
+        var putResult = Arrow_Flight_Protocol_PutResult()
+        putResult.appMetadata = self.appMetadata
+        return putResult
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift
new file mode 100644
index 0000000000000..ba55bede7c70c
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightResult {
+    public let body: Data
+
+    init(_ result: Arrow_Flight_Protocol_Result) {
+        self.body = result.body
+    }
+
+    public init(_ body: Data) {
+        self.body = body
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Result {
+        var result = Arrow_Flight_Protocol_Result()
+        result.body = self.body
+        return result
+    }
+}
diff --git a/cpp/src/arrow/flight/try_compile/check_tls_opts_132.cc b/swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift
similarity index 57%
rename from cpp/src/arrow/flight/try_compile/check_tls_opts_132.cc
rename to swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift
index fa5ba0f43d925..8d5323b731ea8 100644
--- a/cpp/src/arrow/flight/try_compile/check_tls_opts_132.cc
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift
@@ -15,22 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-// Dummy file for checking if TlsCredentialsOptions exists in
-// the grpc::experimental namespace. gRPC versions 1.32 and higher
-// put it here. This is for supporting disabling server
-// validation when using TLS.
+import Foundation
 
-#include <memory>
-#include <grpc/grpc_security_constants.h>
-#include <grpcpp/security/tls_credentials_options.h>
-
-static grpc_tls_server_verification_option check(
-    const grpc::experimental::TlsCredentialsOptions* options) {
-  grpc_tls_server_verification_option server_opt = options->server_verification_option();
-  return server_opt;
-}
-
-int main(int argc, const char** argv) {
-  [[maybe_unused]] grpc_tls_server_verification_option opt = check(nullptr);
-  return 0;
+public class FlightSchemaResult {
+    let schemaResult: Arrow_Flight_Protocol_SchemaResult
+
+    public var schema: Data { schemaResult.schema }
+
+    public init(_ schema: Data) {
+        self.schemaResult = Arrow_Flight_Protocol_SchemaResult.with {
+            $0.schema = schema
+        }
+    }
+
+    init(_ schemaResult: Arrow_Flight_Protocol_SchemaResult) {
+        self.schemaResult = schemaResult
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_SchemaResult {
+        return schemaResult
+    }
 }
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift
new file mode 100644
index 0000000000000..f67f612b0bcb4
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import GRPC
+import NIO
+import NIOConcurrencyHelpers
+import SwiftProtobuf
+import Arrow
+
+public enum ArrowFlightError: Error {
+    case unknown(String?)
+    case notImplemented(String? = nil)
+    case emptyCollection
+    case ioError(String? = nil)
+}
+
+public func schemaToArrowStream(_ schema: ArrowSchema) throws -> Data {
+    let arrowWriter = ArrowWriter()
+    switch arrowWriter.toStream(ArrowWriter.Info(.schema, schema: schema)) {
+    case .success(let result):
+        return result
+    case .failure(let error):
+        throw error
+    }
+}
+
+public func streamToArrowSchema(_ schema: Data) throws -> ArrowSchema {
+    let schemaResult = ArrowReader().fromStream(schema)
+    switch schemaResult {
+    case .success(let result):
+        if let retSchema = result.schema {
+            return retSchema
+        }
+
+        throw ArrowFlightError.ioError("Schema not found")
+    case .failure(let error):
+        throw error
+    }
+}
+
+public protocol ArrowFlightServer: Sendable {
+    func listFlights(_ criteria: FlightCriteria, writer: FlightInfoStreamWriter) async throws
+    func getFlightInfo(_ request: FlightDescriptor) async throws -> FlightInfo
+    func getSchema(_ request: FlightDescriptor) async throws -> ArrowFlight.FlightSchemaResult
+    func listActions(_ writer: ActionTypeStreamWriter) async throws
+    func doAction(_ action: FlightAction, writer: ResultStreamWriter) async throws
+    func doGet(_ ticket: FlightTicket, writer: RecordBatchStreamWriter) async throws
+    func doPut(_ reader: RecordBatchStreamReader, writer: PutResultDataStreamWriter) async throws
+    func doExchange(_ reader: RecordBatchStreamReader, writer: RecordBatchStreamWriter) async throws
+}
+
+public func MakeFlightServer(_ handler: ArrowFlightServer) -> CallHandlerProvider {
+    return InternalFlightServer(handler)
+}
+
+internal final class InternalFlightServer: Arrow_Flight_Protocol_FlightServiceAsyncProvider {
+    let arrowFlightServer: ArrowFlightServer?
+
+    init(_ arrowFlightServer: ArrowFlightServer?) {
+        self.arrowFlightServer = arrowFlightServer
+    }
+
+    func handshake(requestStream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_HandshakeRequest>, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_HandshakeResponse>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func listFlights(request: Arrow_Flight_Protocol_Criteria, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightInfo>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let writer = FlightInfoStreamWriter(responseStream)
+            try await server.listFlights(FlightCriteria(request), writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func getFlightInfo(request: Arrow_Flight_Protocol_FlightDescriptor, context: GRPC.GRPCAsyncServerCallContext) async throws -> Arrow_Flight_Protocol_FlightInfo {
+        if let server = arrowFlightServer {
+            return try await server.getFlightInfo(FlightDescriptor(request)).toProtocol()
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func getSchema(request: Arrow_Flight_Protocol_FlightDescriptor, context: GRPC.GRPCAsyncServerCallContext) async throws -> Arrow_Flight_Protocol_SchemaResult {
+        if let server = arrowFlightServer {
+            return try await server.getSchema(FlightDescriptor(request)).toProtocol()
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func doGet(request: Arrow_Flight_Protocol_Ticket, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let writer = RecordBatchStreamWriter(responseStream)
+            let ticket = FlightTicket(request)
+            try await server.doGet(ticket, writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func doPut(requestStream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_PutResult>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let reader = RecordBatchStreamReader(requestStream)
+            let writer = PutResultDataStreamWriter(responseStream)
+            try await server.doPut(reader, writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func doExchange(requestStream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let reader = RecordBatchStreamReader(requestStream)
+            let writer = RecordBatchStreamWriter(responseStream)
+            try await server.doExchange(reader, writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func doAction(request: Arrow_Flight_Protocol_Action, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_Result>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            try await server.doAction(FlightAction(request), writer: ResultStreamWriter(responseStream))
+            return
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    func listActions(request: Arrow_Flight_Protocol_Empty, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let writer = ActionTypeStreamWriter(responseStream)
+            try await server.listActions(writer)
+            return
+        }
+
+        throw ArrowFlightError.notImplemented()
+    }
+
+    internal var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { return nil }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift
new file mode 100644
index 0000000000000..f77fc3545af5c
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightTicket {
+    public let data: Data
+
+    init(_ ticket: Arrow_Flight_Protocol_Ticket) {
+        self.data = ticket.ticket
+    }
+
+    public init(_ data: Data) {
+        self.data = data
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Ticket {
+        var ticket = Arrow_Flight_Protocol_Ticket()
+        ticket.ticket = self.data
+        return ticket
+    }
+}
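`MakeFlightServer` adapts any `ArrowFlightServer` conformer into a grpc-swift `CallHandlerProvider`, so serving Flight reduces to ordinary grpc-swift server setup. A condensed wiring sketch mirroring the test harness later in this patch (`service` is any conformer; host and port are examples):

```swift
import GRPC
import NIOPosix

// Bind a Flight service to a local port and run until closed.
func serveFlight(_ service: ArrowFlightServer) async throws {
    let group = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
    let server = try await Server.insecure(group: group)
        .withServiceProviders([MakeFlightServer(service)])
        .bind(host: "localhost", port: 8088)
        .get()

    print("Flight server listening on port \(server.channel.localAddress?.port ?? -1)")
    try await server.onClose.get()  // block until the server shuts down
}
```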
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift
new file mode 100644
index 0000000000000..a6b9ce93a9acd
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import Arrow
+import GRPC
+
+public class RecordBatchStreamReader: AsyncSequence, AsyncIteratorProtocol {
+    public typealias AsyncIterator = RecordBatchStreamReader
+    public typealias Element = RecordBatch
+    let reader = ArrowReader()
+    var batches = [RecordBatch]()
+    var batchIndex = 0
+    var streamIterator: any AsyncIteratorProtocol
+    let stream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>
+
+    init(_ stream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>) {
+        self.stream = stream
+        self.streamIterator = self.stream.makeAsyncIterator()
+    }
+
+    public func next() async throws -> Arrow.RecordBatch? {
+        guard !Task.isCancelled else {
+            return nil
+        }
+
+        // Serve any batches left over from the previously decoded message.
+        if batchIndex < batches.count {
+            let batch = batches[batchIndex]
+            batchIndex += 1
+            return batch
+        }
+
+        while true {
+            guard let flightData = try await self.streamIterator.next() else {
+                return nil
+            }
+
+            // The request stream only ever yields FlightData messages.
+            let data = (flightData as! Arrow_Flight_Protocol_FlightData).dataBody
+            switch reader.fromStream(data) {
+            case .success(let result):
+                batches = result.batches
+                if batches.isEmpty {
+                    continue // e.g. a schema-only message; keep reading
+                }
+
+                batchIndex = 1
+                return batches[0]
+            case .failure(let error):
+                throw error
+            }
+        }
+    }
+
+    public func makeAsyncIterator() -> RecordBatchStreamReader {
+        self
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift
new file mode 100644
index 0000000000000..1efeba5310369
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import Arrow
+import GRPC
+
+public class ActionTypeStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>
+
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>) {
+        self.stream = stream
+    }
+
+    public func write(_ actionType: FlightActionType) async throws {
+        try await self.stream.send(actionType.toProtocol())
+    }
+}
+
+public class ResultStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_Result>
+
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_Result>) {
+        self.stream = stream
+    }
+
+    public func write(_ result: FlightResult) async throws {
+        try await self.stream.send(result.toProtocol())
+    }
+}
+
+public class FlightInfoStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightInfo>
+
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightInfo>) {
+        self.stream = stream
+    }
+
+    public func write(_ result: FlightInfo) async throws {
+        try await self.stream.send(result.toProtocol())
+    }
+}
+
+public class PutResultDataStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_PutResult>
+
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_PutResult>) {
+        self.stream = stream
+    }
+
+    public func write(_ result: FlightPutResult) async throws {
+        try await self.stream.send(result.toProtocol())
+    }
+}
+
+public class RecordBatchStreamWriter {
+    let writer = ArrowWriter()
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>
+
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>) {
+        self.stream = stream
+    }
+
+    public func write(_ rb: RecordBatch) async throws {
+        let info = ArrowWriter.Info(.recordbatch,
+                                    schema: rb.schema,
+                                    batches: [rb])
+
+        let result = writer.toStream(info)
+        switch result {
+        case .success(let rbResult):
+            let data = Arrow_Flight_Protocol_FlightData.with {
+                $0.dataBody = rbResult
+            }
+
+            try await self.stream.send(data)
+        case .failure(let error):
+            throw error
+        }
+    }
+}
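With the stream writers in place, the IPC helpers from FlightServer.swift round-trip a schema through the same stream encoding that `getSchema` and `listFlights` carry. A small sanity-check sketch (field names are arbitrary):

```swift
import Arrow

// Encode a schema to the Arrow IPC stream format and decode it back.
func schemaRoundTrip() throws {
    let schema = ArrowSchema.Builder()
        .addField("id", type: ArrowType(ArrowType.ArrowUInt8), isNullable: false)
        .addField("name", type: ArrowType(ArrowType.ArrowString), isNullable: true)
        .finish()

    let bytes = try schemaToArrowStream(schema)   // Data
    let decoded = try streamToArrowSchema(bytes)  // ArrowSchema
    assert(decoded.fields.count == schema.fields.count)
}
```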
diff --git a/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift b/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift
new file mode 100644
index 0000000000000..d0db593b10304
--- /dev/null
+++ b/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift
@@ -0,0 +1,302 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import XCTest
+import struct Foundation.Data
+import struct Foundation.URL
+import GRPC
+import NIOCore
+import NIOPosix
+import Arrow
+
+@testable import ArrowFlight
+
+func makeSchema() -> ArrowSchema {
+    let schemaBuilder = ArrowSchema.Builder()
+    return schemaBuilder.addField("col1", type: ArrowType(ArrowType.ArrowUInt8), isNullable: true)
+        .addField("col2", type: ArrowType(ArrowType.ArrowString), isNullable: false)
+        .addField("col3", type: ArrowType(ArrowType.ArrowDate32), isNullable: false)
+        .finish()
+}
+
+func makeRecordBatch() throws -> RecordBatch {
+    let uint8Builder: NumberArrayBuilder<UInt8> = try ArrowArrayBuilders.loadNumberArrayBuilder()
+    uint8Builder.append(10)
+    uint8Builder.append(22)
+    uint8Builder.append(33)
+    uint8Builder.append(44)
+    let stringBuilder = try ArrowArrayBuilders.loadStringArrayBuilder()
+    stringBuilder.append("test10")
+    stringBuilder.append("test22")
+    stringBuilder.append("test33")
+    stringBuilder.append("test44")
+    let date32Builder = try ArrowArrayBuilders.loadDate32ArrayBuilder()
+    let date2 = Date(timeIntervalSinceReferenceDate: 86400 * 1)
+    let date1 = Date(timeIntervalSinceReferenceDate: 86400 * 5000 + 352)
+    date32Builder.append(date1)
+    date32Builder.append(date2)
+    date32Builder.append(date1)
+    date32Builder.append(date2)
+    let intHolder = ArrowArrayHolder(try uint8Builder.finish())
+    let stringHolder = ArrowArrayHolder(try stringBuilder.finish())
+    let date32Holder = ArrowArrayHolder(try date32Builder.finish())
+    let result = RecordBatch.Builder()
+        .addColumn("col1", arrowArray: intHolder)
+        .addColumn("col2", arrowArray: stringHolder)
+        .addColumn("col3", arrowArray: date32Holder)
+        .finish()
+    switch result {
+    case .success(let recordBatch):
+        return recordBatch
+    case .failure(let error):
+        throw error
+    }
+}
+
+final class MyFlightServer: ArrowFlightServer {
+    func doExchange(_ reader: ArrowFlight.RecordBatchStreamReader, writer: ArrowFlight.RecordBatchStreamWriter) async throws {
+        for try await rb in reader {
+            XCTAssertEqual(rb.schema.fields.count, 3)
+            XCTAssertEqual(rb.length, 4)
+        }
+
+        try await writer.write(try makeRecordBatch())
+    }
+
+    func doPut(_ reader: ArrowFlight.RecordBatchStreamReader, writer: ArrowFlight.PutResultDataStreamWriter) async throws {
+        for try await rb in reader {
+            XCTAssertEqual(rb.schema.fields.count, 3)
+            XCTAssertEqual(rb.length, 4)
+            try await writer.write(FlightPutResult())
+        }
+    }
+
+    func doGet(_ ticket: ArrowFlight.FlightTicket, writer: ArrowFlight.RecordBatchStreamWriter) async throws {
+        try await writer.write(try makeRecordBatch())
+    }
+
+    func getSchema(_ request: ArrowFlight.FlightDescriptor) async throws -> ArrowFlight.FlightSchemaResult {
+        XCTAssertEqual(String(bytes: request.cmd, encoding: .utf8)!, "schema info")
+        XCTAssertEqual(request.type, .cmd)
+        return try ArrowFlight.FlightSchemaResult(schemaToArrowStream(makeSchema()))
+    }
+
+    func getFlightInfo(_ request: ArrowFlight.FlightDescriptor) async throws -> ArrowFlight.FlightInfo {
+        return ArrowFlight.FlightInfo(Data())
+    }
+
+    func listFlights(_ criteria: ArrowFlight.FlightCriteria, writer: ArrowFlight.FlightInfoStreamWriter) async throws {
+        XCTAssertEqual(String(bytes: criteria.expression, encoding: .utf8), "flight criteria expression")
+        let flightInfo = try ArrowFlight.FlightInfo(schemaToArrowStream(makeSchema()))
+        try await writer.write(flightInfo)
+    }
+
+    func listActions(_ writer: ArrowFlight.ActionTypeStreamWriter) async throws {
+        try await writer.write(FlightActionType("type1", description: "desc1"))
+        try await writer.write(FlightActionType("type2", description: "desc2"))
+    }
+
+    func doAction(_ action: FlightAction, writer: ResultStreamWriter) async throws {
+        XCTAssertEqual(action.type, "test_action")
+        XCTAssertEqual(String(bytes: action.body, encoding: .utf8)!, "test_action body")
+        try await writer.write(FlightResult("test_action result".data(using: .utf8)!))
+    }
+}
+
+struct FlightServerImpl {
+    static var server: Server?
+    static var group: MultiThreadedEventLoopGroup?
+    static func run() async throws {
+        do {
+            // Create an event loop group for the server to run on and keep a
+            // reference so the test can shut it down afterwards.
+            let group = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
+            FlightServerImpl.group = group
+            // Wrap the test Flight service in a gRPC call handler provider.
+            let provider = ArrowFlight.MakeFlightServer(MyFlightServer())
+
+            // Start the server and print its address once it has started.
+            FlightServerImpl.server = try await Server.insecure(group: group)
+                .withServiceProviders([provider])
+                .bind(host: "localhost", port: 8088)
+                .get()
+
+            print("server started on port \(server!.channel.localAddress!.port!)")
+        } catch {
+            print("Unknown server error: \(error)")
+        }
+    }
+}
+
+public class FlightClientTester {
+    var client: FlightClient?
+    var group: EventLoopGroup?
+    var channel: GRPCChannel?
+
+    init() async throws {
+        // Create an event loop group and a plaintext channel to the test server.
+        let group = PlatformSupport.makeEventLoopGroup(loopCount: 1)
+        let channel = try GRPCChannelPool.with(
+            target: .host("localhost", port: 8088),
+            transportSecurity: .plaintext,
+            eventLoopGroup: group
+        )
+
+        self.group = group
+        self.channel = channel
+        self.client = FlightClient(channel: channel)
+    }
+
+    deinit {
+        try? channel?.close().wait()
+        try? group?.syncShutdownGracefully()
+    }
+
+    func listActionTest() async throws {
+        var actionTypes = [FlightActionType]()
+        try await client?.listActions { action in
+            actionTypes.append(action)
+        }
+
+        XCTAssertEqual(actionTypes.count, 2)
+        XCTAssertEqual(actionTypes[0].type, "type1")
+        XCTAssertEqual(actionTypes[0].description, "desc1")
+        XCTAssertEqual(actionTypes[1].type, "type2")
+        XCTAssertEqual(actionTypes[1].description, "desc2")
+    }
+
+    func listFlightsTest() async throws {
+        let flightCriteria = FlightCriteria("flight criteria expression".data(using: .utf8)!)
+        var numCalls = 0
+        try await client?.listFlights(flightCriteria, closure: { data in
+            numCalls += 1
+            let schema = try streamToArrowSchema(data.schema)
+            XCTAssertEqual(schema.fields.count, 3)
+        })
+
+        XCTAssertEqual(numCalls, 1)
+    }
+
+    func doActionTest() async throws {
+        let action = FlightAction("test_action", body: "test_action body".data(using: .utf8)!)
+        var actionResults = [FlightResult]()
+        try await client?.doAction(action, closure: { result in
+            actionResults.append(result)
+        })
+
+        XCTAssertEqual(actionResults.count, 1)
+        XCTAssertEqual(String(bytes: actionResults[0].body, encoding: .utf8), "test_action result")
+    }
+
+    func getSchemaTest() async throws {
+        let descriptor = FlightDescriptor(cmd: "schema info".data(using: .utf8)!)
+        let schemaResult = try await client?.getSchema(descriptor)
+        let schema = try streamToArrowSchema(schemaResult!.schema)
+        XCTAssertEqual(schema.fields.count, 3)
+    }
+
+    func doGetTest() async throws {
+        let ticket = FlightTicket("flight_ticket test".data(using: .utf8)!)
+        var numCalls = 0
+        try await client?.doGet(ticket, readerResultClosure: { rb in
+            numCalls += 1
+            XCTAssertEqual(rb.schema!.fields.count, 3)
+            XCTAssertEqual(rb.batches[0].length, 4)
+        })
+
+        XCTAssertEqual(numCalls, 1)
+    }
+
+    func doPutTest() async throws {
+        let rb = try makeRecordBatch()
+        var numCalls = 0
+        try await client?.doPut([rb], closure: { _ in
+            numCalls += 1
+        })
+
+        XCTAssertEqual(numCalls, 1)
+    }
+
+    func doExchangeTest() async throws {
+        let rb = try makeRecordBatch()
+        var numCalls = 0
+        try await client?.doExchange([rb], closure: { result in
+            numCalls += 1
+            XCTAssertEqual(result.schema?.fields.count, 3)
+            XCTAssertEqual(result.batches[0].length, 4)
+        })
+
+        XCTAssertEqual(numCalls, 1)
+    }
+}
+
+actor FlightServerData {
+    public var serverUp = false
+
+    func setServerUp(_ serverUp: Bool) {
+        self.serverUp = serverUp
+    }
+
+    func isServerUp() -> Bool {
+        return serverUp
+    }
+}
+
+final class FlightTest: XCTestCase {
+    let serverData = FlightServerData()
+
+    func testFlightServer() async throws {
+        let basicTask = Task {
+            try await FlightServerImpl.run()
+            defer {
+                print("server shutting down")
+                try! FlightServerImpl.group?.syncShutdownGracefully()
+            }
+
+            await serverData.setServerUp(true)
+            try await FlightServerImpl.server?.onClose.get()
+            return "done"
+        }
+
+        let secondTask = Task {
+            defer {
+                _ = FlightServerImpl.server?.close()
+            }
+
+            while await !serverData.isServerUp() {
+                try await Task.sleep(nanoseconds: 1_000_000)
+            }
+
+            let clientImpl = try await FlightClientTester()
+            try await clientImpl.listActionTest()
+            try await clientImpl.listFlightsTest()
+            try await clientImpl.doActionTest()
+            try await clientImpl.getSchemaTest()
+            try await clientImpl.doGetTest()
+            try await clientImpl.doPutTest()
+            try await clientImpl.doExchangeTest()
+
+            return "done"
+        }
+
+        let _ = try await [basicTask.value, secondTask.value]
+        print("done running")
+    }
+}
diff --git a/swift/gen-protobuffers.sh b/swift/gen-protobuffers.sh
new file mode 100755
index 0000000000000..383a7a2f3195e
--- /dev/null
+++ b/swift/gen-protobuffers.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -eu
+protoc --swift_out=./ArrowFlight/Sources/ArrowFlight --proto_path=../format Flight.proto
+protoc --grpc-swift_out=./ArrowFlight/Sources/ArrowFlight --proto_path=../format Flight.proto
+cat <<HEADER > header.swift
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+HEADER
+mv ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift.orig
+cat header.swift ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift.orig > ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift
+rm ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift.orig
+rm header.swift
\ No newline at end of file