diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml index 13a3008b74bc..0157caf8c296 100644 --- a/.github/actions/setup-builder/action.yaml +++ b/.github/actions/setup-builder/action.yaml @@ -25,6 +25,31 @@ inputs: runs: using: "composite" steps: + - name: Cache Cargo + uses: actions/cache@v3 + with: + # these represent dependencies downloaded by cargo + # and thus do not depend on the OS, arch nor rust version. + # + # source https://github.com/actions/cache/blob/main/examples.md#rust---cargo + path: | + /usr/local/cargo/bin/ + /usr/local/cargo/registry/index/ + /usr/local/cargo/registry/cache/ + /usr/local/cargo/git/db/ + key: cargo-cache3-${{ hashFiles('**/Cargo.toml') }} + restore-keys: cargo-cache3- + - name: Generate lockfile + shell: bash + run: cargo fetch + - name: Cache Rust dependencies + uses: actions/cache@v3 + with: + # these represent compiled steps of both dependencies and arrow + # and thus are specific for a particular OS, arch and rust version. 
+ path: /github/home/target + key: ${{ runner.os }}-${{ runner.arch }}-target-cache3-${{ inputs.rust-version }}-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-${{ runner.arch }}-target-cache3-${{ inputs.rust-version }}- - name: Install Build Dependencies shell: bash run: | @@ -36,4 +61,4 @@ runs: echo "Installing ${{ inputs.rust-version }}" rustup toolchain install ${{ inputs.rust-version }} rustup default ${{ inputs.rust-version }} - rustup component add rustfmt + echo "CARGO_TARGET_DIR=/github/home/target" >> $GITHUB_ENV diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 41b1dcbe8eb9..7eed6b8e94c9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -39,7 +39,7 @@ jobs: path: rust fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.8 - name: Setup Archery @@ -64,17 +64,17 @@ jobs: rustup default ${{ matrix.rust }} rustup component add rustfmt clippy - name: Cache Cargo - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /home/runner/.cargo key: cargo-maturin-cache- - name: Cache Rust dependencies - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /home/runner/target # this key is not equal because maturin uses different compilation flags. 
key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v3 with: python-version: '3.7' - name: Upgrade pip and setuptools diff --git a/.github/workflows/miri.sh b/.github/workflows/miri.sh index 27c6f5eecc87..56da5c5c5d3e 100755 --- a/.github/workflows/miri.sh +++ b/.github/workflows/miri.sh @@ -6,21 +6,12 @@ # rustup default nightly -export MIRIFLAGS="-Zmiri-disable-isolation" +# stacked borrows checking uses too much memory to run successfully in github actions +# re-enable if the CI is migrated to something more powerful (https://github.com/apache/arrow-rs/issues/1833) +# see also https://github.com/rust-lang/miri/issues/1367 +export MIRIFLAGS="-Zmiri-disable-isolation -Zmiri-disable-stacked-borrows" cargo miri setup cargo clean -run_miri() { - # Currently only the arrow crate is tested with miri - # IO related tests and some unsupported tests are skipped - cargo miri test -p arrow -- --skip csv --skip ipc --skip json -} - -# If MIRI fails, automatically retry -# Seems like miri is occasionally killed by the github runner -# https://github.com/apache/arrow-rs/issues/879 -for i in `seq 1 5`; do - echo "Starting Arrow MIRI run..." - run_miri && break - echo "foo" > /tmp/data.txt -done +echo "Starting Arrow MIRI run..." +cargo miri test -p arrow -- --skip csv --skip ipc --skip json diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 73dfc0092836..7feacc07dd73 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-name: Rust +name: MIRI on: # always trigger @@ -26,19 +26,15 @@ jobs: miri-checks: name: MIRI runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [nightly-2022-01-17] steps: - uses: actions/checkout@v2 with: submodules: true - name: Setup Rust toolchain run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy miri + rustup toolchain install nightly --component miri + rustup override set nightly + cargo miri setup - name: Run Miri Checks env: RUST_BACKTRACE: full diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 67272053e6b2..9331db745659 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -30,8 +30,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [stable] + arch: [ amd64 ] + rust: [ stable ] container: image: ${{ matrix.arch }}/rust env: @@ -40,39 +40,23 @@ jobs: RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v2 - - name: Cache Cargo - uses: actions/cache@v2 - with: - # these represent dependencies downloaded by cargo - # and thus do not depend on the OS, arch nor rust version. - path: /github/home/.cargo - key: cargo-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - # these represent compiled steps of both dependencies and arrow - # and thus are specific for a particular OS, arch and rust version. 
- path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache3-${{ matrix.rust }}- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: ${{ matrix.rust }} - name: Build Workspace run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" cargo build # test the crate linux-test: name: Test Workspace on AMD64 Rust ${{ matrix.rust }} - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [stable] + arch: [ amd64 ] + rust: [ stable ] container: image: ${{ matrix.arch }}/rust env: @@ -85,54 +69,52 @@ jobs: - uses: actions/checkout@v2 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache3-${{ matrix.rust }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: ${{ matrix.rust }} - name: Run tests run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - # run tests on all workspace members with default feature list cargo test - - # Switch to arrow crate - cd arrow - # re-run tests on arrow crate to ensure - # all arrays are created correctly - cargo test --features=force_validate - cargo test --features=prettyprint - # run test on arrow crate with minimal set of features - cargo test --no-default-features + - name: Re-run tests with all supported features + run: | + cargo test -p arrow --features=force_validate,prettyprint + - name: Run examples + run: | + # Test arrow examples cargo run --example builders cargo run --example dynamic_types cargo run --example read_csv cargo run --example 
read_csv_infer_schema - cargo check --no-default-features - - # Switch to parquet crate - cd ../parquet - # re-run tests on parquet crate with async feature enabled - cargo test --features=async - cargo check --no-default-features - - # Switch to arrow-flight - cd ../arrow-flight - cargo test --features=flight-sql-experimental - cargo check --no-default-features + - name: Test compilation of arrow library crate with different feature combinations + run: | + cargo check -p arrow + cargo check -p arrow --no-default-features + - name: Test compilation of arrow targets with different feature combinations + run: | + cargo check -p arrow --all-targets + cargo check -p arrow --no-default-features --all-targets + cargo check -p arrow --no-default-features --all-targets --features test_utils + - name: Re-run tests on arrow-flight with all features + run: | + cargo test -p arrow-flight --all-features + - name: Re-run tests on parquet crate with all features + run: | + cargo test -p parquet --all-features + - name: Test compilation of parquet library crate with different feature combinations + run: | + cargo check -p parquet + cargo check -p parquet --no-default-features + cargo check -p parquet --no-default-features --features arrow + - name: Test compilation of parquet targets with different feature combinations + run: | + cargo check -p parquet --all-targets + cargo check -p parquet --no-default-features --all-targets + cargo check -p parquet --no-default-features --features arrow --all-targets + - name: Test compilation of parquet_derive macro with different feature combinations + run: | + cargo check -p parquet_derive # test the --features "simd" of the arrow crate. This requires nightly. 
linux-test-simd: @@ -140,8 +122,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [nightly-2022-05-23] + arch: [ amd64 ] + rust: [ nightly ] container: image: ${{ matrix.arch }}/rust env: @@ -153,40 +135,25 @@ jobs: - uses: actions/checkout@v2 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - key: cargo-nightly-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-nightly-cache3-${{ matrix.rust }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: ${{ matrix.rust }} - name: Run tests run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd arrow - cargo test --features "simd" - - name: Check new project build with simd features + cargo test -p arrow --features "simd" + - name: Check compilation with simd features run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd arrow/test/dependency/simd - cargo check + cargo check -p arrow --features simd + cargo check -p arrow --features simd --all-targets windows-and-macos: name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }} runs-on: ${{ matrix.os }} strategy: matrix: - os: [windows-latest, macos-latest] - rust: [stable] + os: [ windows-latest, macos-latest ] + rust: [ stable ] steps: - uses: actions/checkout@v2 with: @@ -197,7 +164,6 @@ jobs: run: | rustup toolchain install ${{ matrix.rust }} rustup default ${{ matrix.rust }} - rustup component add rustfmt - name: Run tests shell: bash run: | @@ -209,12 +175,12 @@ jobs: clippy: name: Clippy - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [stable] + arch: [ amd64 ] + rust: [ stable ] container: image: ${{ matrix.arch }}/rust env: @@ -225,29 +191,15 @@ jobs: - uses: actions/checkout@v2 with: 
submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache3-${{ matrix.rust }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: ${{ matrix.rust }} - name: Setup Clippy run: | - rustup component add rustfmt clippy + rustup component add clippy - name: Run clippy run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" cargo clippy --features test_common --features prettyprint --features=async --all-targets --workspace -- -D warnings check_benches: @@ -255,8 +207,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [stable] + arch: [ amd64 ] + rust: [ stable ] container: image: ${{ matrix.arch }}/rust env: @@ -267,27 +219,13 @@ jobs: - uses: actions/checkout@v2 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache3-${{ matrix.rust }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: ${{ matrix.rust }} - name: Check benchmarks run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cargo check --benches --workspace --features test_common,prettyprint,async,experimental + cargo check --benches --workspace --features test_common,prettyprint,async,experimental lint: name: Lint (cargo fmt) @@ -309,8 +247,8 @@ jobs: runs-on: 
ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [stable] + arch: [ amd64 ] + rust: [ stable ] steps: - uses: actions/checkout@v2 with: @@ -319,15 +257,14 @@ jobs: run: | rustup toolchain install ${{ matrix.rust }} rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy - name: Cache Cargo - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /home/runner/.cargo # this key is not equal because the user is different than on a container (runner vs github) key: cargo-coverage-cache3- - name: Cache Rust dependencies - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /home/runner/target # this key is not equal because coverage uses different compilation flags. @@ -354,8 +291,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [nightly-2022-05-23] + arch: [ amd64 ] + rust: [ nightly ] container: image: ${{ matrix.arch }}/rust env: @@ -369,12 +306,12 @@ jobs: with: submodules: true - name: Cache Cargo - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /github/home/.cargo key: cargo-wasm32-cache3- - name: Cache Rust dependencies - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /github/home/target key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache3-${{ matrix.rust }} @@ -382,13 +319,10 @@ jobs: run: | rustup toolchain install ${{ matrix.rust }} rustup override set ${{ matrix.rust }} - rustup component add rustfmt rustup target add wasm32-unknown-unknown rustup target add wasm32-wasi - name: Build arrow crate run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" cd arrow cargo build --no-default-features --features=csv,ipc,simd --target wasm32-unknown-unknown cargo build --no-default-features --features=csv,ipc,simd --target wasm32-wasi @@ -399,14 +333,15 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - arch: [amd64] - rust: [nightly-2022-05-23] + arch: [ amd64 ] + rust: [ nightly ] container: image: ${{ matrix.arch }}/rust env: # 
Disable full debug symbol generation to speed up CI build and keep memory down # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" + RUSTDOCFLAGS: "-Dwarnings" steps: - uses: actions/checkout@v2 with: @@ -415,74 +350,10 @@ jobs: run: | apt update apt install -y libpython3.9-dev - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - key: cargo-nightly-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-nightly-cache3-${{ matrix.rust }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: ${{ matrix.rust }} - name: Run cargo doc run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - export RUSTDOCFLAGS="-Dwarnings" cargo doc --document-private-items --no-deps --workspace --all-features - - - # test builds with various feature flag combinations outside the main workspace - default-build: - name: Feature Flag Builds ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable debug symbol generation to speed up CI build and keep memory down - RUSTFLAGS: "-C debuginfo=0" - steps: - - uses: actions/checkout@v2 - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache3- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache3-${{ matrix.rust }} - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: ${{ matrix.rust }} - - name: Arrow Build with default features - run: | - export CARGO_HOME="/github/home/.cargo" - export 
CARGO_TARGET_DIR="/github/home/target" - cd arrow/test/dependency/default-features - cargo check - - name: Arrow Build with default-features=false - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd arrow/test/dependency/no-default-features - cargo check - - name: Parquet Derive build with default-features - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd parquet_derive/test/dependency/default-features - cargo check diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md new file mode 100644 index 000000000000..518697ce09a0 --- /dev/null +++ b/CHANGELOG-old.md @@ -0,0 +1,1311 @@ + + + +## [15.0.0](https://github.com/apache/arrow-rs/tree/15.0.0) (2022-05-27) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/14.0.0...15.0.0) + +**Breaking changes:** + +- Change `ArrayDataBuilder::null_bit_buffer` to accept `Option` rather than `Buffer` [\#1739](https://github.com/apache/arrow-rs/pull/1739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove `null_count` from `ArrayData::try_new()` [\#1721](https://github.com/apache/arrow-rs/pull/1721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Change parquet writers to use standard `std:io::Write` rather custom `ParquetWriter` trait \(\#1717\) \(\#1163\) [\#1719](https://github.com/apache/arrow-rs/pull/1719) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add explicit column mask for selection in parquet: `ProjectionMask` \(\#1701\) [\#1716](https://github.com/apache/arrow-rs/pull/1716) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add type\_ids in Union datatype [\#1703](https://github.com/apache/arrow-rs/pull/1703) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix Parquet Reader's Arrow Schema Inference [\#1682](https://github.com/apache/arrow-rs/pull/1682) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Rename the `string` kernel to `concatenate_elements` [\#1747](https://github.com/apache/arrow-rs/issues/1747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `ArrayDataBuilder::null_bit_buffer` should accept `Option` as input type [\#1737](https://github.com/apache/arrow-rs/issues/1737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix schema comparison for non\_canonical\_map when running flight test [\#1730](https://github.com/apache/arrow-rs/issues/1730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support in aggregate kernel for `BinaryArray` [\#1724](https://github.com/apache/arrow-rs/issues/1724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix incorrect null\_count in `generate_unions_case` integration test [\#1712](https://github.com/apache/arrow-rs/issues/1712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Keep type ids in Union datatype to follow Arrow spec and integrate with other implementations [\#1690](https://github.com/apache/arrow-rs/issues/1690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Reading Alternative List Representations to Arrow From Parquet [\#1680](https://github.com/apache/arrow-rs/issues/1680) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up the offsets checking [\#1675](https://github.com/apache/arrow-rs/issues/1675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Separate Parquet -\> Arrow Schema Conversion From ArrayBuilder [\#1655](https://github.com/apache/arrow-rs/issues/1655) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add `leaf_columns` argument to `ArrowReader::get_record_reader_by_columns` [\#1653](https://github.com/apache/arrow-rs/issues/1653) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Implement `string_concat` kernel [\#1540](https://github.com/apache/arrow-rs/issues/1540) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve Unit Test Coverage of ArrayReaderBuilder [\#1484](https://github.com/apache/arrow-rs/issues/1484) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Parquet write failure \(from record batches\) when data is nested two levels deep [\#1744](https://github.com/apache/arrow-rs/issues/1744) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- IPC reader may break on projection [\#1735](https://github.com/apache/arrow-rs/issues/1735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Latest nightly fails to build with feature simd [\#1734](https://github.com/apache/arrow-rs/issues/1734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Trying to write parquet file in parallel results in corrupt file [\#1717](https://github.com/apache/arrow-rs/issues/1717) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Roundtrip failure when using DELTA\_BINARY\_PACKED [\#1708](https://github.com/apache/arrow-rs/issues/1708) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `ArrayData::try_new` cannot always return expected error. 
[\#1707](https://github.com/apache/arrow-rs/issues/1707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- "out of order projection is not supported" after Fix Parquet Arrow Schema Inference [\#1701](https://github.com/apache/arrow-rs/issues/1701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rust is not interoperability with C++ for IPC schemas with dictionaries [\#1694](https://github.com/apache/arrow-rs/issues/1694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect Repeated Field Schema Inference [\#1681](https://github.com/apache/arrow-rs/issues/1681) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet Treats Embedded Arrow Schema as Authoritative [\#1663](https://github.com/apache/arrow-rs/issues/1663) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- parquet\_to\_arrow\_schema\_by\_columns Incorrectly Handles Nested Types [\#1654](https://github.com/apache/arrow-rs/issues/1654) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Inconsistent Arrow Schema When Projecting Nested Parquet File [\#1652](https://github.com/apache/arrow-rs/issues/1652) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- StructArrayReader Cannot Handle Nested Lists [\#1651](https://github.com/apache/arrow-rs/issues/1651) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Bug \(`substring` kernel\): The null buffer is not aligned when `offset != 0` [\#1639](https://github.com/apache/arrow-rs/issues/1639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Parquet command line tool does not install "globally" [\#1710](https://github.com/apache/arrow-rs/issues/1710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Improve integration test document to follow Arrow C++ repo CI [\#1742](https://github.com/apache/arrow-rs/pull/1742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) + +**Merged pull requests:** + +- Test for list array equality with different offsets [\#1756](https://github.com/apache/arrow-rs/pull/1756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Rename `string_concat` to `concat_elements_utf8` [\#1754](https://github.com/apache/arrow-rs/pull/1754) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Rename the `string` kernel to `concat_elements`. [\#1752](https://github.com/apache/arrow-rs/pull/1752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Support writing nested lists to parquet [\#1746](https://github.com/apache/arrow-rs/pull/1746) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Pin nightly version to bypass packed\_simd build error [\#1743](https://github.com/apache/arrow-rs/pull/1743) ([viirya](https://github.com/viirya)) +- Fix projection in IPC reader [\#1736](https://github.com/apache/arrow-rs/pull/1736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([iyupeng](https://github.com/iyupeng)) +- `cargo install` installs not globally [\#1732](https://github.com/apache/arrow-rs/pull/1732) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kazuk](https://github.com/kazuk)) +- Fix schema comparison for non\_canonical\_map when running flight test [\#1731](https://github.com/apache/arrow-rs/pull/1731) ([viirya](https://github.com/viirya)) +- Add `min_binary` and `max_binary` aggregate kernels [\#1725](https://github.com/apache/arrow-rs/pull/1725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix parquet benchmarks [\#1723](https://github.com/apache/arrow-rs/pull/1723) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix 
BitReader::get\_batch zero extension \(\#1708\) [\#1722](https://github.com/apache/arrow-rs/pull/1722) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Implementation string concat [\#1720](https://github.com/apache/arrow-rs/pull/1720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ismail-Maj](https://github.com/Ismail-Maj)) +- Check the length of `null_bit_buffer` in `ArrayData::try_new()` [\#1714](https://github.com/apache/arrow-rs/pull/1714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix incorrect null\_count in `generate_unions_case` integration test [\#1713](https://github.com/apache/arrow-rs/pull/1713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix: Null buffer accounts for `offset` in `substring` kernel. [\#1704](https://github.com/apache/arrow-rs/pull/1704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Minor: Refine `OffsetSizeTrait` to extend `num::Integer` [\#1702](https://github.com/apache/arrow-rs/pull/1702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix StructArrayReader handling nested lists \(\#1651\) [\#1700](https://github.com/apache/arrow-rs/pull/1700) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Speed up the offsets checking [\#1684](https://github.com/apache/arrow-rs/pull/1684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) + +## [14.0.0](https://github.com/apache/arrow-rs/tree/14.0.0) (2022-05-13) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/13.0.0...14.0.0) + +**Breaking changes:** + +- Use `bytes` in parquet rather than custom Buffer implementation \(\#1474\) 
[\#1683](https://github.com/apache/arrow-rs/pull/1683) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Rename `OffsetSize::fn is_large` to `const OffsetSize::IS_LARGE` [\#1664](https://github.com/apache/arrow-rs/pull/1664) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove `StringOffsetTrait` and `BinaryOffsetTrait` [\#1645](https://github.com/apache/arrow-rs/pull/1645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix `generate_nested_dictionary_case` integration test failure [\#1636](https://github.com/apache/arrow-rs/pull/1636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) + +**Implemented enhancements:** + +- Add support for `DataType::Duration` in ffi interface [\#1688](https://github.com/apache/arrow-rs/issues/1688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix `generate_unions_case` integration test [\#1676](https://github.com/apache/arrow-rs/issues/1676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `DictionaryArray` support for `bit_length` kernel [\#1673](https://github.com/apache/arrow-rs/issues/1673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `DictionaryArray` support for `length` kernel [\#1672](https://github.com/apache/arrow-rs/issues/1672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- flight\_client\_scenarios integration test should receive schema from flight data [\#1669](https://github.com/apache/arrow-rs/issues/1669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Unpin Flatbuffer version dependency [\#1667](https://github.com/apache/arrow-rs/issues/1667) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add dictionary array support for substring function [\#1656](https://github.com/apache/arrow-rs/issues/1656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Exclude dict\_id and dict\_is\_ordered from equality comparison of `Field` [\#1646](https://github.com/apache/arrow-rs/issues/1646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove `StringOffsetTrait` and `BinaryOffsetTrait` [\#1644](https://github.com/apache/arrow-rs/issues/1644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add tests and examples for `UnionArray::from(data: ArrayData)` [\#1643](https://github.com/apache/arrow-rs/issues/1643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add methods `pub fn offsets_buffer`, `pub fn types_ids_buffer`and `pub fn data_buffer` for `ArrayDataBuilder` [\#1640](https://github.com/apache/arrow-rs/issues/1640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix `generate_nested_dictionary_case` integration test failure for Rust cases [\#1635](https://github.com/apache/arrow-rs/issues/1635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Expose `ArrowWriter` row group flush in public API [\#1626](https://github.com/apache/arrow-rs/issues/1626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add `substring` support for `FixedSizeBinaryArray` [\#1618](https://github.com/apache/arrow-rs/issues/1618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add PrettyPrint for `UnionArray`s [\#1594](https://github.com/apache/arrow-rs/issues/1594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add SIMD support for the `length` kernel [\#1489](https://github.com/apache/arrow-rs/issues/1489) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support dictionary arrays in length and bit\_length [\#1674](https://github.com/apache/arrow-rs/pull/1674) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add dictionary array support for substring function [\#1665](https://github.com/apache/arrow-rs/pull/1665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sunchao](https://github.com/sunchao)) +- Add `DecimalType` support in `new_null_array ` [\#1659](https://github.com/apache/arrow-rs/pull/1659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) + +**Fixed bugs:** + +- Docs.rs build is broken [\#1695](https://github.com/apache/arrow-rs/issues/1695) +- Interoperability with C++ for IPC schemas with dictionaries [\#1694](https://github.com/apache/arrow-rs/issues/1694) +- `UnionArray::is_null` incorrect [\#1625](https://github.com/apache/arrow-rs/issues/1625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Published Parquet documentation missing `arrow::async_reader` [\#1617](https://github.com/apache/arrow-rs/issues/1617) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Files written with Julia's Arrow.jl in IPC format cannot be read by arrow-rs [\#1335](https://github.com/apache/arrow-rs/issues/1335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Correct arrow-flight readme version [\#1641](https://github.com/apache/arrow-rs/pull/1641) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) + +**Closed issues:** + +- Make `OffsetSizeTrait::IS_LARGE` as a const value [\#1658](https://github.com/apache/arrow-rs/issues/1658) +- Question: Why are there 3 types of `OffsetSizeTrait`s? [\#1638](https://github.com/apache/arrow-rs/issues/1638) +- Written Parquet file way bigger than input files [\#1627](https://github.com/apache/arrow-rs/issues/1627) +- Ensure there is a single zero in the offsets buffer for an empty ListArray. 
[\#1620](https://github.com/apache/arrow-rs/issues/1620) +- Filtering `UnionArray` Changes DataType [\#1595](https://github.com/apache/arrow-rs/issues/1595) + +**Merged pull requests:** + +- Fix docs.rs build [\#1696](https://github.com/apache/arrow-rs/pull/1696) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- support duration in ffi [\#1689](https://github.com/apache/arrow-rs/pull/1689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ryan-jacobs1](https://github.com/ryan-jacobs1)) +- fix bench command line options [\#1685](https://github.com/apache/arrow-rs/pull/1685) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kazuk](https://github.com/kazuk)) +- Enable branch protection [\#1679](https://github.com/apache/arrow-rs/pull/1679) ([tustvold](https://github.com/tustvold)) +- Fix logical merge conflict in \#1588 [\#1678](https://github.com/apache/arrow-rs/pull/1678) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix generate\_unions\_case for Rust case [\#1677](https://github.com/apache/arrow-rs/pull/1677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Receive schema from flight data [\#1670](https://github.com/apache/arrow-rs/pull/1670) ([viirya](https://github.com/viirya)) +- unpin flatbuffers dependency version [\#1668](https://github.com/apache/arrow-rs/pull/1668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Cheappie](https://github.com/Cheappie)) +- Remove parquet dictionary converters \(\#1661\) [\#1662](https://github.com/apache/arrow-rs/pull/1662) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Minor: simplify the function `GenericListArray::get_type` [\#1650](https://github.com/apache/arrow-rs/pull/1650) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Pretty Print `UnionArray`s [\#1648](https://github.com/apache/arrow-rs/pull/1648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tfeda](https://github.com/tfeda)) +- Exclude `dict_id` and `dict_is_ordered` from equality comparison of `Field` [\#1647](https://github.com/apache/arrow-rs/pull/1647) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- expose row-group flush in public api [\#1634](https://github.com/apache/arrow-rs/pull/1634) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Cheappie](https://github.com/Cheappie)) +- Add `substring` support for `FixedSizeBinaryArray` [\#1633](https://github.com/apache/arrow-rs/pull/1633) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix UnionArray is\_null [\#1632](https://github.com/apache/arrow-rs/pull/1632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Do not assume dictionaries exists in footer [\#1631](https://github.com/apache/arrow-rs/pull/1631) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pcjentsch](https://github.com/pcjentsch)) +- Add support for nested list arrays from parquet to arrow arrays \(\#993\) [\#1588](https://github.com/apache/arrow-rs/pull/1588) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add `async` into doc features [\#1349](https://github.com/apache/arrow-rs/pull/1349) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([HaoYang670](https://github.com/HaoYang670)) + + +## [13.0.0](https://github.com/apache/arrow-rs/tree/13.0.0) (2022-04-29) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/12.0.0...13.0.0) + +**Breaking changes:** + +- Update `parquet::basic::LogicalType` to be more idiomatic 
[\#1612](https://github.com/apache/arrow-rs/pull/1612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tfeda](https://github.com/tfeda)) +- Fix Null Mask Handling in `ArrayData`, `UnionArray`, and `MapArray` [\#1589](https://github.com/apache/arrow-rs/pull/1589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Replace `&Option` with `Option<&T>` in several `arrow` and `parquet` APIs [\#1571](https://github.com/apache/arrow-rs/pull/1571) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tfeda](https://github.com/tfeda)) + +**Implemented enhancements:** + +- Read/write nested dictionary under fixed size list in ipc stream reader/write [\#1609](https://github.com/apache/arrow-rs/issues/1609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for `BinaryArray` in `substring` kernel [\#1593](https://github.com/apache/arrow-rs/issues/1593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Read/write nested dictionary under large list in ipc stream reader/write [\#1584](https://github.com/apache/arrow-rs/issues/1584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Read/write nested dictionary under map in ipc stream reader/write [\#1582](https://github.com/apache/arrow-rs/issues/1582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `Clone` for JSON `DecoderOptions` [\#1580](https://github.com/apache/arrow-rs/issues/1580) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add utf-8 validation checking to `substring` kernel [\#1575](https://github.com/apache/arrow-rs/issues/1575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting to/from `DataType::Null` in `cast` kernel [\#1572](https://github.com/apache/arrow-rs/pull/1572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([WinkerDu](https://github.com/WinkerDu)) + +**Fixed bugs:** + +- Parquet schema should allow scale == precision for decimal type [\#1606](https://github.com/apache/arrow-rs/issues/1606) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- ListArray::from\(ArrayData\) dereferences invalid pointer when offsets are empty [\#1601](https://github.com/apache/arrow-rs/issues/1601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ArrayData Equality Incorrect Null Mask Offset Handling [\#1599](https://github.com/apache/arrow-rs/issues/1599) +- Filtering UnionArray Incorrect Handles Runs [\#1598](https://github.com/apache/arrow-rs/issues/1598) +- \[Safety\] Filtering Dense UnionArray Produces Invalid Offsets [\#1596](https://github.com/apache/arrow-rs/issues/1596) +- \[Safety\] UnionBuilder Doesn't Check Types [\#1591](https://github.com/apache/arrow-rs/issues/1591) +- Union Layout Should Not Support Separate Validity Mask [\#1590](https://github.com/apache/arrow-rs/issues/1590) +- Incorrect nullable flag when reading maps \( test\_read\_maps fails when `force_validate` is active\) [\#1587](https://github.com/apache/arrow-rs/issues/1587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Output of `ipc::reader::tests::projection_should_work` fails validation [\#1548](https://github.com/apache/arrow-rs/issues/1548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect min/max statistics for decimals with byte-array notation [\#1532](https://github.com/apache/arrow-rs/issues/1532) + +**Documentation updates:** + +- Minor: Clarify docs on `UnionBuilder::append_null` [\#1628](https://github.com/apache/arrow-rs/pull/1628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Closed issues:** + +- Dense UnionArray Offsets Are i32 not i8 [\#1597](https://github.com/apache/arrow-rs/issues/1597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace `&Option` with 
`Option<&T>` in some APIs [\#1556](https://github.com/apache/arrow-rs/issues/1556) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve ergonomics of `parquet::basic::LogicalType` [\#1554](https://github.com/apache/arrow-rs/issues/1554) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Mark the current `substring` function as `unsafe` and rename it. [\#1541](https://github.com/apache/arrow-rs/issues/1541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Requirements for Async Parquet API [\#1473](https://github.com/apache/arrow-rs/issues/1473) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Merged pull requests:** + +- Nit: use the standard function `div_ceil` [\#1629](https://github.com/apache/arrow-rs/pull/1629) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Update flatbuffers requirement from =2.1.1 to =2.1.2 [\#1622](https://github.com/apache/arrow-rs/pull/1622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix decimals min max statistics [\#1621](https://github.com/apache/arrow-rs/pull/1621) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([atefsawaed](https://github.com/atefsawaed)) +- Add example readme [\#1615](https://github.com/apache/arrow-rs/pull/1615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve docs and examples links on main readme [\#1614](https://github.com/apache/arrow-rs/pull/1614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Read/Write nested dictionaries under FixedSizeList in IPC [\#1610](https://github.com/apache/arrow-rs/pull/1610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add `substring` support for 
binary [\#1608](https://github.com/apache/arrow-rs/pull/1608) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Parquet: schema validation should allow scale == precision for decimal type [\#1607](https://github.com/apache/arrow-rs/pull/1607) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sunchao](https://github.com/sunchao)) +- Don't access and validate offset buffer in ListArray::from\(ArrayData\) [\#1602](https://github.com/apache/arrow-rs/pull/1602) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Fix map nullable flag in `ParquetTypeConverter` [\#1592](https://github.com/apache/arrow-rs/pull/1592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Read/write nested dictionary under large list in ipc stream reader/writer [\#1585](https://github.com/apache/arrow-rs/pull/1585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Read/write nested dictionary under map in ipc stream reader/writer [\#1583](https://github.com/apache/arrow-rs/pull/1583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Derive `Clone` and `PartialEq` for json `DecoderOptions` [\#1581](https://github.com/apache/arrow-rs/pull/1581) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add utf-8 validation checking for `substring` [\#1577](https://github.com/apache/arrow-rs/pull/1577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Use `Option` rather than `Option<&T>` for copy types in substring kernel [\#1576](https://github.com/apache/arrow-rs/pull/1576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use littleendian arrow files for 
`projection_should_work` [\#1573](https://github.com/apache/arrow-rs/pull/1573) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + + +## [12.0.0](https://github.com/apache/arrow-rs/tree/12.0.0) (2022-04-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/11.1.0...12.0.0) + +**Breaking changes:** + +- Add `ArrowReaderOptions` to `ParquetFileArrowReader`, add option to skip decoding arrow metadata from parquet \(\#1459\) [\#1558](https://github.com/apache/arrow-rs/pull/1558) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support `RecordBatch` with zero columns but non zero row count, add field to `RecordBatchOptions` \(\#1536\) [\#1552](https://github.com/apache/arrow-rs/pull/1552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Consolidate JSON Reader options and `DecoderOptions` [\#1539](https://github.com/apache/arrow-rs/pull/1539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Update `prost`, `prost-derive` and `prost-types` to 0.10, `tonic`, and `tonic-build` to `0.7` [\#1510](https://github.com/apache/arrow-rs/pull/1510) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Add Json `DecoderOptions` and support custom `format_string` for each field [\#1451](https://github.com/apache/arrow-rs/pull/1451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sum12](https://github.com/sum12)) + +**Implemented enhancements:** + +- Read/write nested dictionary in ipc stream reader/writer [\#1565](https://github.com/apache/arrow-rs/issues/1565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `FixedSizeBinary` in the Arrow C data interface [\#1553](https://github.com/apache/arrow-rs/issues/1553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Support Empty Column Projection in `ParquetRecordBatchReader` [\#1537](https://github.com/apache/arrow-rs/issues/1537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support `RecordBatch` with zero columns but non zero row count [\#1536](https://github.com/apache/arrow-rs/issues/1536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for `Date32`/`Date64`\<--\> `String`/`LargeString` in `cast` kernel [\#1535](https://github.com/apache/arrow-rs/issues/1535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support creating arrays from externally owned memory like `Vec` or `String` [\#1516](https://github.com/apache/arrow-rs/issues/1516) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speed up the `substring` kernel [\#1511](https://github.com/apache/arrow-rs/issues/1511) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Handle Parquet Files With Inconsistent Timestamp Units [\#1459](https://github.com/apache/arrow-rs/issues/1459) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Error Inferring Schema for LogicalType::UNKNOWN [\#1557](https://github.com/apache/arrow-rs/issues/1557) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Read dictionary from nested struct in ipc stream reader panics [\#1549](https://github.com/apache/arrow-rs/issues/1549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `filter` produces invalid sparse `UnionArray`s [\#1547](https://github.com/apache/arrow-rs/issues/1547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Documentation for `GenericListBuilder` is not exposed. 
[\#1518](https://github.com/apache/arrow-rs/issues/1518) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- cannot read parquet file [\#1515](https://github.com/apache/arrow-rs/issues/1515) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- The `substring` kernel panics when chars \> U+0x007F [\#1478](https://github.com/apache/arrow-rs/issues/1478) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Hang due to infinite loop when reading some parquet files with RLE encoding and bit packing [\#1458](https://github.com/apache/arrow-rs/issues/1458) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- Improve JSON reader documentation [\#1559](https://github.com/apache/arrow-rs/pull/1559) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve doc string for `substring` kernel [\#1529](https://github.com/apache/arrow-rs/pull/1529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Expose documentation of `GenericListBuilder` [\#1525](https://github.com/apache/arrow-rs/pull/1525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comath](https://github.com/comath)) +- Add a diagram to `take` kernel documentation [\#1524](https://github.com/apache/arrow-rs/pull/1524) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Closed issues:** + +- Interesting benchmark results of `min_max_helper` [\#1400](https://github.com/apache/arrow-rs/issues/1400) + +**Merged pull requests:** + +- Fix incorrect `into_buffers` for UnionArray [\#1567](https://github.com/apache/arrow-rs/pull/1567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Read/write nested dictionary in ipc stream reader/writer [\#1566](https://github.com/apache/arrow-rs/pull/1566) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support FixedSizeBinary and FixedSizeList for the C data interface [\#1564](https://github.com/apache/arrow-rs/pull/1564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sunchao](https://github.com/sunchao)) +- Split out ListArrayReader into separate module \(\#1483\) [\#1563](https://github.com/apache/arrow-rs/pull/1563) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Split out `MapArray` into separate module \(\#1483\) [\#1562](https://github.com/apache/arrow-rs/pull/1562) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support empty projection in `ParquetRecordBatchReader` [\#1560](https://github.com/apache/arrow-rs/pull/1560) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- fix infinite loop in not fully packed bit-packed runs [\#1555](https://github.com/apache/arrow-rs/pull/1555) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add test for creating FixedSizeBinaryArray::try\_from\_sparse\_iter failed when given all Nones [\#1551](https://github.com/apache/arrow-rs/pull/1551) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix reading dictionaries from nested structs in ipc `StreamReader` [\#1550](https://github.com/apache/arrow-rs/pull/1550) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dispanser](https://github.com/dispanser)) +- Add support for Date32/64 \<--\> String/LargeString in `cast` kernel [\#1534](https://github.com/apache/arrow-rs/pull/1534) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) +- fix clippy errors in 1.60 [\#1527](https://github.com/apache/arrow-rs/pull/1527) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Mark `remove-old-releases.sh` executable [\#1522](https://github.com/apache/arrow-rs/pull/1522) ([alamb](https://github.com/alamb)) +- Delete duplicate code in the `sort` kernel [\#1519](https://github.com/apache/arrow-rs/pull/1519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix reading nested lists from parquet files [\#1517](https://github.com/apache/arrow-rs/pull/1517) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Speed up the `substring` kernel by about 2x [\#1512](https://github.com/apache/arrow-rs/pull/1512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add `new_from_strings` to create `MapArrays` [\#1507](https://github.com/apache/arrow-rs/pull/1507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Decouple buffer deallocation from ffi and allow creating buffers from rust vec [\#1494](https://github.com/apache/arrow-rs/pull/1494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) + +## [11.1.0](https://github.com/apache/arrow-rs/tree/11.1.0) (2022-03-31) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/11.0.0...11.1.0) + +**Implemented enhancements:** + +- Implement `size_hint` and `ExactSizedIterator` for DecimalArray [\#1505](https://github.com/apache/arrow-rs/issues/1505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support calculate length by chars for `StringArray` [\#1493](https://github.com/apache/arrow-rs/issues/1493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `length` kernel support for `ListArray` [\#1470](https://github.com/apache/arrow-rs/issues/1470) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- The length kernel should work with `BinaryArray`s [\#1464](https://github.com/apache/arrow-rs/issues/1464) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FFI for Arrow C Stream Interface [\#1348](https://github.com/apache/arrow-rs/issues/1348) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of `DictionaryArray::try_new()` [\#1313](https://github.com/apache/arrow-rs/issues/1313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- MIRI error in math\_checked\_divide\_op/try\_from\_trusted\_len\_iter [\#1496](https://github.com/apache/arrow-rs/issues/1496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet Writer Incorrect Definition Levels for Nested NullArray [\#1480](https://github.com/apache/arrow-rs/issues/1480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- FFI: ArrowArray::try\_from\_raw shouldn't clone [\#1425](https://github.com/apache/arrow-rs/issues/1425) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader fails to read null list. [\#1399](https://github.com/apache/arrow-rs/issues/1399) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- A small mistake in the doc of `BinaryArray` and `LargeBinaryArray` [\#1455](https://github.com/apache/arrow-rs/issues/1455) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- A small mistake in the doc of `GenericBinaryArray::take_iter_unchecked` [\#1454](https://github.com/apache/arrow-rs/issues/1454) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add links in the doc of `BinaryOffsetSizeTrait` [\#1453](https://github.com/apache/arrow-rs/issues/1453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- The doc of `FixedSizeBinaryArray` is confusing. 
[\#1452](https://github.com/apache/arrow-rs/issues/1452) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Clarify docs that SlicesIterator ignores null values [\#1504](https://github.com/apache/arrow-rs/pull/1504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Update the doc of `BinaryArray` and `LargeBinaryArray` [\#1471](https://github.com/apache/arrow-rs/pull/1471) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) + +**Closed issues:** + +- `packed_simd` v.s. `portable_simd`, which should be used? [\#1492](https://github.com/apache/arrow-rs/issues/1492) +- Cleanup: Use Arrow take kernel Within parquet ListArrayReader [\#1482](https://github.com/apache/arrow-rs/issues/1482) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Merged pull requests:** + +- Implement `size_hint` and `ExactSizedIterator` for `DecimalArray` [\#1506](https://github.com/apache/arrow-rs/pull/1506) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add `StringArray::num_chars` for calculating number of characters [\#1503](https://github.com/apache/arrow-rs/pull/1503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Workaround nightly miri error in `try_from_trusted_len_iter` [\#1497](https://github.com/apache/arrow-rs/pull/1497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- update doc of array\_binary and array\_string [\#1491](https://github.com/apache/arrow-rs/pull/1491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Use Arrow take kernel within ListArrayReader [\#1490](https://github.com/apache/arrow-rs/pull/1490) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add `length` 
kernel support for List Array [\#1488](https://github.com/apache/arrow-rs/pull/1488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Support sort for `Decimal` data type [\#1487](https://github.com/apache/arrow-rs/pull/1487) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) +- Fix reading/writing nested null arrays \(\#1480\) \(\#1036\) \(\#1399\) [\#1481](https://github.com/apache/arrow-rs/pull/1481) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Implement ArrayEqual for UnionArray [\#1469](https://github.com/apache/arrow-rs/pull/1469) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support the `length` kernel on Binary Array [\#1465](https://github.com/apache/arrow-rs/pull/1465) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove Clone and copy source structs internally [\#1449](https://github.com/apache/arrow-rs/pull/1449) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix Parquet reader for null lists [\#1448](https://github.com/apache/arrow-rs/pull/1448) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Improve performance of DictionaryArray::try\_new\(\)  [\#1435](https://github.com/apache/arrow-rs/pull/1435) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Add FFI for Arrow C Stream Interface [\#1384](https://github.com/apache/arrow-rs/pull/1384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + +## [11.0.0](https://github.com/apache/arrow-rs/tree/11.0.0) (2022-03-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/10.0.0...11.0.0) + +**Breaking changes:** 
+ +- Replace `filter_row_groups` with `ReadOptions` in parquet SerializedFileReader [\#1389](https://github.com/apache/arrow-rs/pull/1389) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([yjshen](https://github.com/yjshen)) +- Implement projection for arrow `IPC Reader` file / streams [\#1339](https://github.com/apache/arrow-rs/pull/1339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Dandandan](https://github.com/Dandandan)) + +**Implemented enhancements:** + +- Fix generate\_interval\_case integration test failure [\#1445](https://github.com/apache/arrow-rs/issues/1445) +- Make the doc examples of `ListArray` and `LargeListArray` more readable [\#1433](https://github.com/apache/arrow-rs/issues/1433) +- Redundant `if` and `abs` in `shift()` [\#1427](https://github.com/apache/arrow-rs/issues/1427) +- Improve substring kernel performance [\#1422](https://github.com/apache/arrow-rs/issues/1422) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add missing value\_unchecked\(\) of `FixedSizeBinaryArray` [\#1419](https://github.com/apache/arrow-rs/issues/1419) +- Remove duplicate bound check in function `shift` [\#1408](https://github.com/apache/arrow-rs/issues/1408) +- Support dictionary array in C data interface [\#1397](https://github.com/apache/arrow-rs/issues/1397) +- filter kernel should work with `UnionArray`s [\#1394](https://github.com/apache/arrow-rs/issues/1394) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- filter kernel should work with `FixedSizeListArrays`s [\#1393](https://github.com/apache/arrow-rs/issues/1393) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add doc examples for creating FixedSizeListArray [\#1392](https://github.com/apache/arrow-rs/issues/1392) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update `rust-version` to 1.59 [\#1377](https://github.com/apache/arrow-rs/issues/1377) +- 
Arrow IPC projection support [\#1338](https://github.com/apache/arrow-rs/issues/1338) +- Implement basic FlightSQL Server [\#1386](https://github.com/apache/arrow-rs/pull/1386) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([wangfenjin](https://github.com/wangfenjin)) + +**Fixed bugs:** + +- DictionaryArray::try\_new ignores validity bitmap of the keys [\#1429](https://github.com/apache/arrow-rs/issues/1429) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- The doc of `GenericListArray` is confusing [\#1424](https://github.com/apache/arrow-rs/issues/1424) +- DeltaBitPackDecoder Incorrectly Handles Non-Zero MiniBlock Bit Width Padding [\#1417](https://github.com/apache/arrow-rs/issues/1417) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- DeltaBitPackEncoder Pads Miniblock BitWidths With Arbitrary Values [\#1416](https://github.com/apache/arrow-rs/issues/1416) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Possible unaligned write with MutableBuffer::push [\#1410](https://github.com/apache/arrow-rs/issues/1410) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Integration Test is failing on master branch [\#1398](https://github.com/apache/arrow-rs/issues/1398) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Rewrite doc of `GenericListArray` [\#1450](https://github.com/apache/arrow-rs/pull/1450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix integration doc about build.ninja location [\#1438](https://github.com/apache/arrow-rs/pull/1438) ([viirya](https://github.com/viirya)) + +**Merged pull requests:** + +- Rewrite doc example of `ListArray` and `LargeListArray` [\#1447](https://github.com/apache/arrow-rs/pull/1447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix generate\_interval\_case in integration test 
[\#1446](https://github.com/apache/arrow-rs/pull/1446) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix generate\_decimal128\_case in integration test [\#1440](https://github.com/apache/arrow-rs/pull/1440) ([viirya](https://github.com/viirya)) +- `filter` kernel should work with FixedSizeListArrays [\#1434](https://github.com/apache/arrow-rs/pull/1434) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support nullable keys in DictionaryArray::try\_new [\#1430](https://github.com/apache/arrow-rs/pull/1430) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- remove redundant if/clamp\_min/abs [\#1428](https://github.com/apache/arrow-rs/pull/1428) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Add doc example for creating `FixedSizeListArray` [\#1426](https://github.com/apache/arrow-rs/pull/1426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Directly write to MutableBuffer in substring [\#1423](https://github.com/apache/arrow-rs/pull/1423) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix possibly unaligned writes in MutableBuffer [\#1421](https://github.com/apache/arrow-rs/pull/1421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Add value\_unchecked\(\) and unit test [\#1420](https://github.com/apache/arrow-rs/pull/1420) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Fix DeltaBitPack MiniBlock Bit Width Padding [\#1418](https://github.com/apache/arrow-rs/pull/1418) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update zstd requirement from 0.10 to 0.11 
[\#1415](https://github.com/apache/arrow-rs/pull/1415) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Set `default-features = false` for `zstd` in the parquet crate to support `wasm32-unknown-unknown` [\#1414](https://github.com/apache/arrow-rs/pull/1414) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kylebarron](https://github.com/kylebarron)) +- Add support for `UnionArray` in `filter` kernel [\#1412](https://github.com/apache/arrow-rs/pull/1412) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove duplicate bound check in the function `shift` [\#1409](https://github.com/apache/arrow-rs/pull/1409) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add dictionary support for C data interface [\#1407](https://github.com/apache/arrow-rs/pull/1407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sunchao](https://github.com/sunchao)) +- Fix a small spelling mistake in docs. 
[\#1406](https://github.com/apache/arrow-rs/pull/1406) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add unit test to check `FixedSizeBinaryArray` input all none [\#1405](https://github.com/apache/arrow-rs/pull/1405) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Move csv Parser trait and its implementations to utils module [\#1385](https://github.com/apache/arrow-rs/pull/1385) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sum12](https://github.com/sum12)) + +## [10.0.0](https://github.com/apache/arrow-rs/tree/10.0.0) (2022-03-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/9.1.0...10.0.0) + +**Breaking changes:** + +- Remove existing has\_ methods for optional fields in `ColumnChunkMetaData` [\#1346](https://github.com/apache/arrow-rs/pull/1346) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) +- Remove redundant `has_` methods in `ColumnChunkMetaData` [\#1345](https://github.com/apache/arrow-rs/pull/1345) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) + +**Implemented enhancements:** + +- Add extract month and day in temporal.rs [\#1387](https://github.com/apache/arrow-rs/issues/1387) +- Add clone to `IpcWriteOptions` [\#1381](https://github.com/apache/arrow-rs/issues/1381) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `MapArray` in `filter` kernel [\#1378](https://github.com/apache/arrow-rs/issues/1378) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `week` temporal kernel [\#1375](https://github.com/apache/arrow-rs/issues/1375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of `compare_dict_op` [\#1371](https://github.com/apache/arrow-rs/issues/1371) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for LargeUtf8 in json writer [\#1357](https://github.com/apache/arrow-rs/issues/1357) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make `arrow::array::builder::MapBuilder` public [\#1354](https://github.com/apache/arrow-rs/issues/1354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor `StructArray::from` [\#1351](https://github.com/apache/arrow-rs/issues/1351) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor `RecordBatch::validate_new_batch` [\#1350](https://github.com/apache/arrow-rs/issues/1350) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove redundant has\_ methods for optional column metadata fields [\#1344](https://github.com/apache/arrow-rs/issues/1344) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add `write` method to JsonWriter [\#1340](https://github.com/apache/arrow-rs/issues/1340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor the code of `Bitmap::new` [\#1337](https://github.com/apache/arrow-rs/issues/1337) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use DictionaryArray's iterator in `compare_dict_op` [\#1329](https://github.com/apache/arrow-rs/issues/1329) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `as_decimal_array(arr: &dyn Array) -> &DecimalArray` [\#1312](https://github.com/apache/arrow-rs/issues/1312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- More ergonomic / idiomatic primitive array creation from iterators [\#1298](https://github.com/apache/arrow-rs/issues/1298) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement DictionaryArray support in `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#1201](https://github.com/apache/arrow-rs/issues/1201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- `cargo clippy` fails on 
the `master` branch [\#1362](https://github.com/apache/arrow-rs/issues/1362) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `ArrowArray::try_from_raw` should not assume the pointers are from Arc [\#1333](https://github.com/apache/arrow-rs/issues/1333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix CSV Writer::new to accept delimiter and make WriterBuilder::build use it [\#1328](https://github.com/apache/arrow-rs/issues/1328) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make bounds configurable via builder when reading CSV [\#1327](https://github.com/apache/arrow-rs/issues/1327) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `with_datetime_format()` to CSV WriterBuilder [\#1272](https://github.com/apache/arrow-rs/issues/1272) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Performance improvements:** + +- Improve performance of `min` and `max` aggregation kernels without nulls [\#1373](https://github.com/apache/arrow-rs/issues/1373) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Consider removing redundant has\_XXX metadata functions in `ColumnChunkMetadata` [\#1332](https://github.com/apache/arrow-rs/issues/1332) + +**Merged pull requests:** + +- Support extract `day` and `month` in temporal.rs [\#1388](https://github.com/apache/arrow-rs/pull/1388) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Add write method to Json Writer [\#1383](https://github.com/apache/arrow-rs/pull/1383) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) +- Derive `Clone` for `IpcWriteOptions` [\#1382](https://github.com/apache/arrow-rs/pull/1382) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) +- feat: support maps in MutableArrayData 
[\#1379](https://github.com/apache/arrow-rs/pull/1379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) +- Support extract `week` in temporal.rs [\#1376](https://github.com/apache/arrow-rs/pull/1376) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Speed up the function `min_max_string` [\#1374](https://github.com/apache/arrow-rs/pull/1374) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Improve performance of dictionary kernels, add benchmark and add `take_iter_unchecked` [\#1372](https://github.com/apache/arrow-rs/pull/1372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update pyo3 requirement from 0.15 to 0.16 [\#1369](https://github.com/apache/arrow-rs/pull/1369) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update contributing guide [\#1368](https://github.com/apache/arrow-rs/pull/1368) ([HaoYang670](https://github.com/HaoYang670)) +- Allow primitive array creation from iterators of PrimitiveTypes \(as well as `Option`\) [\#1367](https://github.com/apache/arrow-rs/pull/1367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update flatbuffers requirement from =2.1.0 to =2.1.1 [\#1364](https://github.com/apache/arrow-rs/pull/1364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix clippy lints [\#1363](https://github.com/apache/arrow-rs/pull/1363) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Refactor `RecordBatch::validate_new_batch` [\#1361](https://github.com/apache/arrow-rs/pull/1361) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Refactor `StructArray::from` [\#1360](https://github.com/apache/arrow-rs/pull/1360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Update flatbuffers requirement from =2.0.0 to =2.1.0 [\#1359](https://github.com/apache/arrow-rs/pull/1359) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: add LargeUtf8 support in json writer [\#1358](https://github.com/apache/arrow-rs/pull/1358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tiphaineruy](https://github.com/tiphaineruy)) +- Add `as_decimal_array` function [\#1356](https://github.com/apache/arrow-rs/pull/1356) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Publicly export arrow::array::MapBuilder [\#1355](https://github.com/apache/arrow-rs/pull/1355) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tjwilson90](https://github.com/tjwilson90)) +- Add with\_datetime\_format to csv WriterBuilder [\#1347](https://github.com/apache/arrow-rs/pull/1347) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Refactor `Bitmap::new` [\#1343](https://github.com/apache/arrow-rs/pull/1343) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove delimiter from csv Writer [\#1342](https://github.com/apache/arrow-rs/pull/1342) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Make bounds configurable in csv ReaderBuilder [\#1341](https://github.com/apache/arrow-rs/pull/1341) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- `ArrowArray::try_from_raw` should not assume the pointers are from Arc 
[\#1334](https://github.com/apache/arrow-rs/pull/1334) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use DictionaryArray's iterator in `compare_dict_op` [\#1330](https://github.com/apache/arrow-rs/pull/1330) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Implement DictionaryArray support in neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#1326](https://github.com/apache/arrow-rs/pull/1326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Arrow Rust + Conbench Integration [\#1289](https://github.com/apache/arrow-rs/pull/1289) ([dianaclarke](https://github.com/dianaclarke)) + +## [9.1.0](https://github.com/apache/arrow-rs/tree/9.1.0) (2022-02-19) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/9.0.2...9.1.0) + +**Implemented enhancements:** + +- Exposing page encoding stats [\#1321](https://github.com/apache/arrow-rs/issues/1321) +- Improve filter performance by special casing high and low selectivity predicates [\#1288](https://github.com/apache/arrow-rs/issues/1288) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speed up `DeltaBitPackDecoder` [\#1281](https://github.com/apache/arrow-rs/issues/1281) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Fix all clippy lints in arrow crate [\#1255](https://github.com/apache/arrow-rs/issues/1255) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Expose page encoding `ColumnChunkMetadata` [\#1322](https://github.com/apache/arrow-rs/pull/1322) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) +- Expose column index and offset index in `ColumnChunkMetadata` [\#1318](https://github.com/apache/arrow-rs/pull/1318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) +- Expose bloom filter 
offset in `ColumnChunkMetadata` [\#1309](https://github.com/apache/arrow-rs/pull/1309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) +- Add `DictionaryArray::try_new()` to create dictionaries from pre existing arrays [\#1300](https://github.com/apache/arrow-rs/pull/1300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add `DictionaryArray::keys_iter`, and `take_iter` for other array types [\#1296](https://github.com/apache/arrow-rs/pull/1296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make `rle` decoder public under `experimental` feature [\#1271](https://github.com/apache/arrow-rs/pull/1271) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Add `DictionaryArray` support in `eq_dyn` kernel [\#1263](https://github.com/apache/arrow-rs/pull/1263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + +**Fixed bugs:** + +- `len` is not a parameter of `MutableArrayData::extend` [\#1316](https://github.com/apache/arrow-rs/issues/1316) +- module `data_type` is private in Rust Parquet 8.0.0 [\#1302](https://github.com/apache/arrow-rs/issues/1302) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Test failure: bit\_chunk\_iterator [\#1294](https://github.com/apache/arrow-rs/issues/1294) +- csv\_writer benchmark fails with "no such file or directory" [\#1292](https://github.com/apache/arrow-rs/issues/1292) + +**Documentation updates:** + +- Fix warnings in `cargo doc` [\#1268](https://github.com/apache/arrow-rs/pull/1268) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Vectorize DeltaBitPackDecoder, up to 5x faster decoding 
[\#1284](https://github.com/apache/arrow-rs/pull/1284) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Skip zero-ing primitive nulls [\#1280](https://github.com/apache/arrow-rs/pull/1280) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add specialized filter kernels in `compute` module \(up to 10x faster\) [\#1248](https://github.com/apache/arrow-rs/pull/1248) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Closed issues:** + +- Expose column and offset index metadata offset [\#1317](https://github.com/apache/arrow-rs/issues/1317) +- Expose bloom filter metadata offset [\#1308](https://github.com/apache/arrow-rs/issues/1308) +- Improve ergonomics to construct `DictionaryArrays` from `Key` and `Value` arrays [\#1299](https://github.com/apache/arrow-rs/issues/1299) +- Make it easier to iterate over `DictionaryArray` [\#1295](https://github.com/apache/arrow-rs/issues/1295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- (WON'T FIX) Don't Intertwine Bit and Byte Aligned Operations in `BitReader` [\#1282](https://github.com/apache/arrow-rs/issues/1282) +- how to create arrow::array from streamReader [\#1278](https://github.com/apache/arrow-rs/issues/1278) +- Remove scientific notation when converting floats to strings. 
[\#983](https://github.com/apache/arrow-rs/issues/983) + +**Merged pull requests:** + +- Update the document of function `MutableArrayData::extend` [\#1336](https://github.com/apache/arrow-rs/pull/1336) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix clippy lint `dead_code` [\#1324](https://github.com/apache/arrow-rs/pull/1324) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- fix test bug and ensure that bloom filter metadata is serialized in `to_thrift` [\#1320](https://github.com/apache/arrow-rs/pull/1320) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) +- Enable more clippy lints in arrow [\#1315](https://github.com/apache/arrow-rs/pull/1315) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Fix clippy lint `clippy::type_complexity` [\#1310](https://github.com/apache/arrow-rs/pull/1310) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Fix clippy lint `clippy::float_equality_without_abs` [\#1305](https://github.com/apache/arrow-rs/pull/1305) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Fix clippy `clippy::vec_init_then_push` lint [\#1303](https://github.com/apache/arrow-rs/pull/1303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Fix failing csv\_writer bench [\#1293](https://github.com/apache/arrow-rs/pull/1293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- Changes for 9.0.2 [\#1291](https://github.com/apache/arrow-rs/pull/1291) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Fix bitmask creation also for simd comparisons with scalar [\#1290](https://github.com/apache/arrow-rs/pull/1290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Fix simd comparison kernels [\#1286](https://github.com/apache/arrow-rs/pull/1286) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Restrict Decoder to compatible types \(\#1276\) [\#1277](https://github.com/apache/arrow-rs/pull/1277) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix some clippy lints in parquet crate, rename `LevelEncoder` variants to conform to Rust standards [\#1273](https://github.com/apache/arrow-rs/pull/1273) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([HaoYang670](https://github.com/HaoYang670)) +- Use new DecimalArray creation API in arrow crate [\#1249](https://github.com/apache/arrow-rs/pull/1249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve `DecimalArray` API ergonomics: add `iter()`, `FromIterator`, `with_precision_and_scale` [\#1223](https://github.com/apache/arrow-rs/pull/1223) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + + +## [9.0.2](https://github.com/apache/arrow-rs/tree/9.0.2) (2022-02-09) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/8.0.0...9.0.2) + +**Breaking changes:** + +- Add `Send` + `Sync` to `DataType`, `RowGroupReader`, `FileReader`, `ChunkReader`. 
[\#1264](https://github.com/apache/arrow-rs/issues/1264) +- Rename the function `Bitmap::len` to `Bitmap::bit_len` to clarify its meaning [\#1242](https://github.com/apache/arrow-rs/pull/1242) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove unused / broken `memory-check` feature [\#1222](https://github.com/apache/arrow-rs/pull/1222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Potentially buffer multiple `RecordBatches` before writing a parquet row group in `ArrowWriter` [\#1214](https://github.com/apache/arrow-rs/pull/1214) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Add `async` arrow parquet reader [\#1154](https://github.com/apache/arrow-rs/pull/1154) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Rename `Bitmap::len` to `Bitmap::bit_len` [\#1233](https://github.com/apache/arrow-rs/issues/1233) +- Extend CSV schema inference to allow scientific notation for floating point types [\#1215](https://github.com/apache/arrow-rs/issues/1215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Write Multiple RecordBatch to Parquet Row Group [\#1211](https://github.com/apache/arrow-rs/issues/1211) +- Add doc examples for `eq_dyn` etc. 
[\#1202](https://github.com/apache/arrow-rs/issues/1202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add comparison kernels for `BinaryArray` [\#1108](https://github.com/apache/arrow-rs/issues/1108) +- `impl ArrowNativeType for i128` [\#1098](https://github.com/apache/arrow-rs/issues/1098) +- Remove `Copy` trait bound from dyn scalar kernels [\#1243](https://github.com/apache/arrow-rs/pull/1243) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) +- Add `into_inner` for IPC `FileWriter` [\#1236](https://github.com/apache/arrow-rs/pull/1236) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) +- \[Minor\]Re-export `array::builder::make_builder` to make it available for downstream [\#1235](https://github.com/apache/arrow-rs/pull/1235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) + +**Fixed bugs:** + +- Parquet v8.0.0 panics when reading all null column to NullArray [\#1245](https://github.com/apache/arrow-rs/issues/1245) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Get `Unknown configuration option rust-version` when running the rust format command [\#1240](https://github.com/apache/arrow-rs/issues/1240) +- `Bitmap` Length Validation is Incorrect [\#1231](https://github.com/apache/arrow-rs/issues/1231) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Writing sliced `ListArray` or `MapArray` ignore offsets [\#1226](https://github.com/apache/arrow-rs/issues/1226) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Remove broken `memory-tracking` crate feature [\#1171](https://github.com/apache/arrow-rs/issues/1171) +- Revert making `parquet::data_type` and `parquet::arrow::schema` experimental [\#1244](https://github.com/apache/arrow-rs/pull/1244) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([tustvold](https://github.com/tustvold)) + +**Documentation updates:** + +- Update parquet crate documentation and examples [\#1253](https://github.com/apache/arrow-rs/pull/1253) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Refresh parquet readme / contributing guide [\#1252](https://github.com/apache/arrow-rs/pull/1252) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add docs examples for dynamically compare functions [\#1250](https://github.com/apache/arrow-rs/pull/1250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add Rust Docs examples for UnionArray [\#1241](https://github.com/apache/arrow-rs/pull/1241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Improve documentation for Bitmap [\#1237](https://github.com/apache/arrow-rs/pull/1237) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Improve performance for arithmetic kernels with `simd` feature enabled \(except for division/modulo\) [\#1221](https://github.com/apache/arrow-rs/pull/1221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Do not concatenate identical dictionaries [\#1219](https://github.com/apache/arrow-rs/pull/1219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Preserve dictionary encoding when decoding parquet into Arrow arrays, 60x perf improvement \(\#171\) [\#1180](https://github.com/apache/arrow-rs/pull/1180) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + +**Closed issues:** + +- `UnalignedBitChunkIterator` that iterates through already 
aligned `u64` blocks [\#1227](https://github.com/apache/arrow-rs/issues/1227) +- Remove unused `ArrowArrayReader` in parquet [\#1197](https://github.com/apache/arrow-rs/issues/1197) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Merged pull requests:** + +- Upgrade clap to 3.0.0 [\#1261](https://github.com/apache/arrow-rs/pull/1261) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Update chrono-tz requirement from 0.4 to 0.6 [\#1259](https://github.com/apache/arrow-rs/pull/1259) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update zstd requirement from 0.9 to 0.10 [\#1257](https://github.com/apache/arrow-rs/pull/1257) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix NullArrayReader \(\#1245\) [\#1246](https://github.com/apache/arrow-rs/pull/1246) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- dyn compare for binary array [\#1238](https://github.com/apache/arrow-rs/pull/1238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove arrow array reader \(\#1197\) [\#1234](https://github.com/apache/arrow-rs/pull/1234) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix null bitmap length validation \(\#1231\) [\#1232](https://github.com/apache/arrow-rs/pull/1232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster bitmask iteration [\#1228](https://github.com/apache/arrow-rs/pull/1228) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add non utf8 values into the test cases of BinaryArray 
comparison [\#1220](https://github.com/apache/arrow-rs/pull/1220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Update DECIMAL\_RE to allow scientific notation in auto inferred schemas [\#1216](https://github.com/apache/arrow-rs/pull/1216) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pjmore](https://github.com/pjmore)) +- Fix simd comparison kernels [\#1286](https://github.com/apache/arrow-rs/pull/1286) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Fix bitmask creation also for simd comparisons with scalar [\#1290](https://github.com/apache/arrow-rs/pull/1290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) + +## [8.0.0](https://github.com/apache/arrow-rs/tree/8.0.0) (2022-01-20) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/7.0.0...8.0.0) + +**Breaking changes:** + +- Return error from JSON writer rather than panic [\#1205](https://github.com/apache/arrow-rs/pull/1205) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Remove `ArrowSignedNumericType ` to Simplify and reduce code duplication in arithmetic kernels [\#1161](https://github.com/apache/arrow-rs/pull/1161) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Restrict RecordReader and friends to scalar types \(\#1132\) [\#1155](https://github.com/apache/arrow-rs/pull/1155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Move more parquet functionality behind experimental feature flag \(\#1032\) [\#1134](https://github.com/apache/arrow-rs/pull/1134) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Parquet reader should be able to read 
structs within list [\#1186](https://github.com/apache/arrow-rs/issues/1186) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Disable serde\_json `arbitrary_precision` feature flag [\#1174](https://github.com/apache/arrow-rs/issues/1174) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Simplify and reduce code duplication in arithmetic.rs [\#1160](https://github.com/apache/arrow-rs/issues/1160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Return `Err` from JSON writer rather than `panic!` for unsupported types [\#1157](https://github.com/apache/arrow-rs/issues/1157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `scalar` mathematics kernels for `Array` and scalar value [\#1153](https://github.com/apache/arrow-rs/issues/1153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `DecimalArray` in sort kernel [\#1137](https://github.com/apache/arrow-rs/issues/1137) +- Parquet Fuzz Tests [\#1053](https://github.com/apache/arrow-rs/issues/1053) +- BooleanBufferBuilder Append Packed [\#1038](https://github.com/apache/arrow-rs/issues/1038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet Performance Optimization: StructArrayReader Redundant Level & Bitmap Computation [\#1034](https://github.com/apache/arrow-rs/issues/1034) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Reduce Public Parquet API [\#1032](https://github.com/apache/arrow-rs/issues/1032) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add `from_iter_values` for binary array [\#1188](https://github.com/apache/arrow-rs/pull/1188) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- Add support for `MapArray` in json writer [\#1149](https://github.com/apache/arrow-rs/pull/1149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) + +**Fixed bugs:** + +- Empty string arrays 
with no nulls are not equal [\#1208](https://github.com/apache/arrow-rs/issues/1208) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Pretty print a `RecordBatch` containing `Float16` triggers a panic [\#1193](https://github.com/apache/arrow-rs/issues/1193) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Writing structs nested in lists produces an incorrect output [\#1184](https://github.com/apache/arrow-rs/issues/1184) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Undefined behavior for `GenericStringArray::from_iter_values` if reported iterator upper bound is incorrect [\#1144](https://github.com/apache/arrow-rs/issues/1144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Interval comparisons with `simd` feature asserts [\#1136](https://github.com/apache/arrow-rs/issues/1136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RecordReader Permits Illegal Types [\#1132](https://github.com/apache/arrow-rs/issues/1132) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Security fixes:** + +- Fix undefined behavior in GenericStringArray::from\_iter\_values [\#1145](https://github.com/apache/arrow-rs/pull/1145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- parquet: Optimized ByteArrayReader, Add UTF-8 Validation \(\#1040\) [\#1082](https://github.com/apache/arrow-rs/pull/1082) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Documentation updates:** + +- Update parquet crate readme [\#1192](https://github.com/apache/arrow-rs/pull/1192) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Document safety justification of some uses of `from_trusted_len_iter` [\#1148](https://github.com/apache/arrow-rs/pull/1148) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Improve parquet reading performance for columns with nulls by preserving bitmask when possible \(\#1037\) [\#1054](https://github.com/apache/arrow-rs/pull/1054) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve parquet performance: Skip levels computation for required struct arrays in parquet [\#1035](https://github.com/apache/arrow-rs/pull/1035) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + +**Closed issues:** + +- Generify ColumnReaderImpl and RecordReader [\#1040](https://github.com/apache/arrow-rs/issues/1040) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet Preserve BitMask [\#1037](https://github.com/apache/arrow-rs/issues/1037) + +**Merged pull requests:** + +- fix a bug in variable sized equality [\#1209](https://github.com/apache/arrow-rs/pull/1209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) +- Pin WASM / packed SIMD tests to nightly-2022-01-17 [\#1204](https://github.com/apache/arrow-rs/pull/1204) ([alamb](https://github.com/alamb)) +- feat: add support for casting Duration/Interval to Int64Array [\#1196](https://github.com/apache/arrow-rs/pull/1196) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([e-dard](https://github.com/e-dard)) +- Add comparison support for fully qualified BinaryArray [\#1195](https://github.com/apache/arrow-rs/pull/1195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Fix in display of `Float16Array` [\#1194](https://github.com/apache/arrow-rs/pull/1194) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) +- update 
nightly version for miri [\#1189](https://github.com/apache/arrow-rs/pull/1189) ([Jimexist](https://github.com/Jimexist)) +- feat\(parquet\): support for reading structs nested within lists [\#1187](https://github.com/apache/arrow-rs/pull/1187) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([helgikrs](https://github.com/helgikrs)) +- fix: Fix a bug in how definition levels are calculated for nested structs in a list [\#1185](https://github.com/apache/arrow-rs/pull/1185) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([helgikrs](https://github.com/helgikrs)) +- Truncate bitmask on BooleanBufferBuilder::resize: [\#1183](https://github.com/apache/arrow-rs/pull/1183) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ticket reference for false positive in clippy [\#1181](https://github.com/apache/arrow-rs/pull/1181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix record formatting in 1.58 [\#1178](https://github.com/apache/arrow-rs/pull/1178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Serialize i128 as JSON string [\#1175](https://github.com/apache/arrow-rs/pull/1175) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support DecimalType in `sort` and `take` kernels [\#1172](https://github.com/apache/arrow-rs/pull/1172) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Fix new clippy lints introduced in Rust 1.58 [\#1170](https://github.com/apache/arrow-rs/pull/1170) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix compilation error with simd feature 
[\#1169](https://github.com/apache/arrow-rs/pull/1169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Fix bug while writing parquet with empty lists of structs [\#1166](https://github.com/apache/arrow-rs/pull/1166) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([helgikrs](https://github.com/helgikrs)) +- Use tempfile for parquet tests [\#1165](https://github.com/apache/arrow-rs/pull/1165) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove left over dev/README.md file from arrow/arrow-rs split [\#1162](https://github.com/apache/arrow-rs/pull/1162) ([alamb](https://github.com/alamb)) +- Add multiply\_scalar kernel [\#1159](https://github.com/apache/arrow-rs/pull/1159) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fuzz test different parquet encodings [\#1156](https://github.com/apache/arrow-rs/pull/1156) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add subtract\_scalar kernel [\#1152](https://github.com/apache/arrow-rs/pull/1152) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add add\_scalar kernel [\#1151](https://github.com/apache/arrow-rs/pull/1151) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Move simd right out of for\_each loop [\#1150](https://github.com/apache/arrow-rs/pull/1150) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Internal Remove `GenericStringArray::from_vec` and `GenericStringArray::from_opt_vec` [\#1147](https://github.com/apache/arrow-rs/pull/1147) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Implement SIMD comparison operations for types with less than 4 lanes \(i128\) 
[\#1146](https://github.com/apache/arrow-rs/pull/1146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Extends parquet fuzz tests to also test nulls, dictionaries and row groups with multiple pages \(\#1053\) [\#1110](https://github.com/apache/arrow-rs/pull/1110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Generify ColumnReaderImpl and RecordReader \(\#1040\) [\#1041](https://github.com/apache/arrow-rs/pull/1041) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- BooleanBufferBuilder::append\_packed \(\#1038\) [\#1039](https://github.com/apache/arrow-rs/pull/1039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +## [7.0.0](https://github.com/apache/arrow-rs/tree/7.0.0) (2022-01-07) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/6.5.0...7.0.0) + +### Arrow + +**Breaking changes:** +- `pretty_format_batches` now returns `Result` rather than `String`: [#975](https://github.com/apache/arrow-rs/pull/975) +- `MutableBuffer::typed_data_mut` is marked `unsafe`: [#1029](https://github.com/apache/arrow-rs/pull/1029) +- UnionArray updated to match latest Arrow spec, added `UnionMode`, `UnionArray::new()` marked `unsafe`: [#885](https://github.com/apache/arrow-rs/pull/885) + +**New Features:** +- Support for `Float16Array` types [#888](https://github.com/apache/arrow-rs/pull/888) +- IPC support for `UnionArray` [#654](https://github.com/apache/arrow-rs/issues/654) +- Dynamic comparison kernels for scalars (e.g. 
`eq_dyn_scalar`), including `DictionaryArray`: [#1113](https://github.com/apache/arrow-rs/issues/1113) + +**Enhancements:** +- Added `Schema::with_metadata` and `Field::with_metadata` [#1092](https://github.com/apache/arrow-rs/pull/1092) +- Support for custom datetime format for inference and parsing csv files [#1112](https://github.com/apache/arrow-rs/pull/1112) +- Implement `Array` for `ArrayRef` for easier use [#1129](https://github.com/apache/arrow-rs/pull/1129) +- Pretty printing display support for `FixedSizeBinaryArray` [#1097](https://github.com/apache/arrow-rs/pull/1097) +- Dependency Upgrades: `pyo3`, `parquet-format`, `prost`, `tonic` +- Avoid allocating vector of indices in `lexicographical_partition_ranges`[#998](https://github.com/apache/arrow-rs/pull/998) + +### Parquet + +**Fixed bugs:** +- (parquet) Fix reading of dictionary encoded pages with null values: [#1130](https://github.com/apache/arrow-rs/pull/1130) + + +# Changelog + +## [6.5.0](https://github.com/apache/arrow-rs/tree/6.5.0) (2021-12-23) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/6.4.0...6.5.0) + +* [092fc64bbb019244887ebd0d9c9a2d3e3a9aebc0](https://github.com/apache/arrow-rs/commit/092fc64bbb019244887ebd0d9c9a2d3e3a9aebc0) support cast decimal to decimal ([#1084](https://github.com/apache/arrow-rs/pull/1084)) ([#1093](https://github.com/apache/arrow-rs/pull/1093)) +* [01459762ed18b504e00e7b2818fce91f19188b1e](https://github.com/apache/arrow-rs/commit/01459762ed18b504e00e7b2818fce91f19188b1e) Fix like regex escaping ([#1085](https://github.com/apache/arrow-rs/pull/1085)) ([#1090](https://github.com/apache/arrow-rs/pull/1090)) +* [7c748bfccbc2eac0c1138378736b70dcb7e26a5b](https://github.com/apache/arrow-rs/commit/7c748bfccbc2eac0c1138378736b70dcb7e26a5b) support cast decimal to signed numeric ([#1073](https://github.com/apache/arrow-rs/pull/1073)) ([#1089](https://github.com/apache/arrow-rs/pull/1089)) +* 
[bd3600b6483c253ae57a38928a636d39a6b7cb02](https://github.com/apache/arrow-rs/commit/bd3600b6483c253ae57a38928a636d39a6b7cb02) parquet: Use constant for RLE decoder buffer size ([#1070](https://github.com/apache/arrow-rs/pull/1070)) ([#1088](https://github.com/apache/arrow-rs/pull/1088)) +* [2b5c53ecd92468fd95328637a15de7f35b6fcf28](https://github.com/apache/arrow-rs/commit/2b5c53ecd92468fd95328637a15de7f35b6fcf28) Box RleDecoder index buffer ([#1061](https://github.com/apache/arrow-rs/pull/1061)) ([#1062](https://github.com/apache/arrow-rs/pull/1062)) ([#1081](https://github.com/apache/arrow-rs/pull/1081)) +* [78721bc1a467177679ad6196b994759cf4d73377](https://github.com/apache/arrow-rs/commit/78721bc1a467177679ad6196b994759cf4d73377) BooleanBufferBuilder correct buffer length ([#1051](https://github.com/apache/arrow-rs/pull/1051)) ([#1052](https://github.com/apache/arrow-rs/pull/1052)) ([#1080](https://github.com/apache/arrow-rs/pull/1080)) +* [3a5e3541d3a4db61a828011ed95c8539adf1d57c](https://github.com/apache/arrow-rs/commit/3a5e3541d3a4db61a828011ed95c8539adf1d57c) support cast signed numeric to decimal ([#1044](https://github.com/apache/arrow-rs/pull/1044)) ([#1079](https://github.com/apache/arrow-rs/pull/1079)) +* [000bdb3053098255d43288aa3e8665e8b1892a6c](https://github.com/apache/arrow-rs/commit/000bdb3053098255d43288aa3e8665e8b1892a6c) fix(compute): LIKE escape parenthesis ([#1042](https://github.com/apache/arrow-rs/pull/1042)) ([#1078](https://github.com/apache/arrow-rs/pull/1078)) +* [e0abdb9e62772a2f853974e68e744246e7f47569](https://github.com/apache/arrow-rs/commit/e0abdb9e62772a2f853974e68e744246e7f47569) Add Schema::project and RecordBatch::project functions ([#1033](https://github.com/apache/arrow-rs/pull/1033)) ([#1077](https://github.com/apache/arrow-rs/pull/1077)) +* [31911a4d6328d889d98796b896412b3997f73e13](https://github.com/apache/arrow-rs/commit/31911a4d6328d889d98796b896412b3997f73e13) Remove outdated safety example from doc 
([#1050](https://github.com/apache/arrow-rs/pull/1050)) ([#1058](https://github.com/apache/arrow-rs/pull/1058)) +* [71ac8620993a65a7f1f57278c3495556625356b3](https://github.com/apache/arrow-rs/commit/71ac8620993a65a7f1f57278c3495556625356b3) Use existing array type in `take` kernel ([#1046](https://github.com/apache/arrow-rs/pull/1046)) ([#1057](https://github.com/apache/arrow-rs/pull/1057)) +* [1c5902376b7f7d56cb5249db4f98a6a370ead919](https://github.com/apache/arrow-rs/commit/1c5902376b7f7d56cb5249db4f98a6a370ead919) Extract method to drive PageIterator -> RecordReader ([#1031](https://github.com/apache/arrow-rs/pull/1031)) ([#1056](https://github.com/apache/arrow-rs/pull/1056)) +* [7ca39361f8733b86bc0cef5ed5d74093e2c6b14d](https://github.com/apache/arrow-rs/commit/7ca39361f8733b86bc0cef5ed5d74093e2c6b14d) Clarify governance of arrow crate ([#1030](https://github.com/apache/arrow-rs/pull/1030)) ([#1055](https://github.com/apache/arrow-rs/pull/1055)) + + +## [6.4.0](https://github.com/apache/arrow-rs/tree/6.4.0) (2021-12-10) + + +[Full Changelog](https://github.com/apache/arrow-rs/compare/6.3.0...6.4.0) + + +* [049f48559f578243935b6e512d06c4c2df360bf1](https://github.com/apache/arrow-rs/commit/049f48559f578243935b6e512d06c4c2df360bf1) Force new cargo and target caching to fix CI ([#1023](https://github.com/apache/arrow-rs/pull/1023)) ([#1024](https://github.com/apache/arrow-rs/pull/1024)) +* [ef37da3b60f71a52d5ad67e9ca810dca38b29f00](https://github.com/apache/arrow-rs/commit/ef37da3b60f71a52d5ad67e9ca810dca38b29f00) Fix a broken link and some missing styling in the main arrow crate docs ([#1013](https://github.com/apache/arrow-rs/pull/1013)) ([#1019](https://github.com/apache/arrow-rs/pull/1019)) +* [f2c746a9b968714cfe05d35fcee8658371acd899](https://github.com/apache/arrow-rs/commit/f2c746a9b968714cfe05d35fcee8658371acd899) Remove out of date comment ([#1008](https://github.com/apache/arrow-rs/pull/1008)) ([#1018](https://github.com/apache/arrow-rs/pull/1018)) +* 
[557fc11e3b2a09a680c0cfbf38d27b13101b63fe](https://github.com/apache/arrow-rs/commit/557fc11e3b2a09a680c0cfbf38d27b13101b63fe) Remove unneeded `rc` feature of serde ([#990](https://github.com/apache/arrow-rs/pull/990)) ([#1016](https://github.com/apache/arrow-rs/pull/1016)) +* [b28385e096b1cf8f5fb2773d49b160f93d94fbac](https://github.com/apache/arrow-rs/commit/b28385e096b1cf8f5fb2773d49b160f93d94fbac) Docstrings for Timestamp*Array. ([#988](https://github.com/apache/arrow-rs/pull/988)) ([#1015](https://github.com/apache/arrow-rs/pull/1015)) +* [a92672e40217670d2566a85d70b0b59fffac594c](https://github.com/apache/arrow-rs/commit/a92672e40217670d2566a85d70b0b59fffac594c) Add full data validation for ArrayData::try_new() ([#1007](https://github.com/apache/arrow-rs/pull/1007)) +* [6c8b2936d7b07e1e2f5d1d48eea425a385382dfb](https://github.com/apache/arrow-rs/commit/6c8b2936d7b07e1e2f5d1d48eea425a385382dfb) Add boolean comparison to scalar kernels for less then, greater than ([#977](https://github.com/apache/arrow-rs/pull/977)) ([#1005](https://github.com/apache/arrow-rs/pull/1005)) +* [14d140aeca608a23a8a6b2c251c8f53ffd377e61](https://github.com/apache/arrow-rs/commit/14d140aeca608a23a8a6b2c251c8f53ffd377e61) Fix some typos in code and comments ([#985](https://github.com/apache/arrow-rs/pull/985)) ([#1006](https://github.com/apache/arrow-rs/pull/1006)) +* [b4507f562fb0eddfb79840871cd2733dc0e337cd](https://github.com/apache/arrow-rs/commit/b4507f562fb0eddfb79840871cd2733dc0e337cd) Fix warnings introduced by Rust/Clippy 1.57.0 ([#1004](https://github.com/apache/arrow-rs/pull/1004)) + + +## [6.3.0](https://github.com/apache/arrow-rs/tree/6.3.0) (2021-11-26) + + +[Full Changelog](https://github.com/apache/arrow-rs/compare/6.2.0...6.3.0) + + +**Changes:** +* [7e51df015ce851a5de444ca08b57b38e7ee959a3](https://github.com/apache/arrow-rs/commit/7e51df015ce851a5de444ca08b57b38e7ee959a3) add more error test case and change the code style 
([#952](https://github.com/apache/arrow-rs/pull/952)) ([#976](https://github.com/apache/arrow-rs/pull/976)) +* [6c570cfe98d6a7a4ec74b139b733c5c72ed10015](https://github.com/apache/arrow-rs/commit/6c570cfe98d6a7a4ec74b139b733c5c72ed10015) Support read decimal data from csv reader if user provide the schema with decimal data type ([#941](https://github.com/apache/arrow-rs/pull/941)) ([#974](https://github.com/apache/arrow-rs/pull/974)) +* [4fa0d4d7f7d9ca0a3da2a6dfe3eae6dc2d51a79a](https://github.com/apache/arrow-rs/commit/4fa0d4d7f7d9ca0a3da2a6dfe3eae6dc2d51a79a) Adding Pretty Print Support For Fixed Size List ([#958](https://github.com/apache/arrow-rs/pull/958)) ([#968](https://github.com/apache/arrow-rs/pull/968)) +* [9d453a3128013c03e8ed854ded76b15cc6f28be4](https://github.com/apache/arrow-rs/commit/9d453a3128013c03e8ed854ded76b15cc6f28be4) Fix bug in temporal utilities due to DST being ignored. ([#955](https://github.com/apache/arrow-rs/pull/955)) ([#967](https://github.com/apache/arrow-rs/pull/967)) +* [1b9fd9e3fb2653236513bb7dda5aa2fa14d1d831](https://github.com/apache/arrow-rs/commit/1b9fd9e3fb2653236513bb7dda5aa2fa14d1d831) Inferring 2. 
as Float64 for issue [#929](https://github.com/apache/arrow-rs/pull/929) ([#950](https://github.com/apache/arrow-rs/pull/950)) ([#966](https://github.com/apache/arrow-rs/pull/966)) +* [e6c5e1c877bd94b3d6e545567f901d9962257cf8](https://github.com/apache/arrow-rs/commit/e6c5e1c877bd94b3d6e545567f901d9962257cf8) Fix CI for latest nightly ([#970](https://github.com/apache/arrow-rs/pull/970)) ([#973](https://github.com/apache/arrow-rs/pull/973)) +* [c96e8de457442806e18944f0b26dd06ba4cb1aee](https://github.com/apache/arrow-rs/commit/c96e8de457442806e18944f0b26dd06ba4cb1aee) Fix primitive sort when input contains more nulls than the given sort limit ([#954](https://github.com/apache/arrow-rs/pull/954)) ([#965](https://github.com/apache/arrow-rs/pull/965)) +* [094037d418381584178db1d886cad3b5024b414a](https://github.com/apache/arrow-rs/commit/094037d418381584178db1d886cad3b5024b414a) Update comfy-table to 5.0 ([#957](https://github.com/apache/arrow-rs/pull/957)) ([#964](https://github.com/apache/arrow-rs/pull/964)) +* [9f635021eee6786c5377c891218c5f88ebce07c3](https://github.com/apache/arrow-rs/commit/9f635021eee6786c5377c891218c5f88ebce07c3) Fix csv writing of timestamps to show timezone. 
([#849](https://github.com/apache/arrow-rs/pull/849)) ([#963](https://github.com/apache/arrow-rs/pull/963)) +* [f7deba4c3a050a52608462ee8a827bb8f6364140](https://github.com/apache/arrow-rs/commit/f7deba4c3a050a52608462ee8a827bb8f6364140) Adding ability to parse float from number with leading decimal ([#831](https://github.com/apache/arrow-rs/pull/831)) ([#962](https://github.com/apache/arrow-rs/pull/962)) +* [59f96e842d05b63882f7ba285c66a9739761cf84](https://github.com/apache/arrow-rs/commit/59f96e842d05b63882f7ba285c66a9739761cf84) add ilike comparitor ([#874](https://github.com/apache/arrow-rs/pull/874)) ([#961](https://github.com/apache/arrow-rs/pull/961)) +* [54023c8a5543c9f9fa4955afa01189029f3e96f5](https://github.com/apache/arrow-rs/commit/54023c8a5543c9f9fa4955afa01189029f3e96f5) Remove unpassable cargo publish check from verify-release-candidate.sh ([#882](https://github.com/apache/arrow-rs/pull/882)) ([#949](https://github.com/apache/arrow-rs/pull/949)) + + + +## [6.2.0](https://github.com/apache/arrow-rs/tree/6.2.0) (2021-11-12) + + +[Full Changelog](https://github.com/apache/arrow-rs/compare/6.1.0...6.2.0) + +**Features / Fixes:** + + +* [4037933e43cad9e4de027039ce14caa65f78300a](https://github.com/apache/arrow-rs/commit/4037933e43cad9e4de027039ce14caa65f78300a) Fix validation for offsets of StructArrays ([#942](https://github.com/apache/arrow-rs/pull/942)) ([#946](https://github.com/apache/arrow-rs/pull/946)) +* [1af9ca5d363d870550026a7b1abcb749befbb371](https://github.com/apache/arrow-rs/commit/1af9ca5d363d870550026a7b1abcb749befbb371) implement take kernel for null arrays ([#939](https://github.com/apache/arrow-rs/pull/939)) ([#944](https://github.com/apache/arrow-rs/pull/944)) +* [320de1c20aefbf204f6888e2ad3663863afeba9f](https://github.com/apache/arrow-rs/commit/320de1c20aefbf204f6888e2ad3663863afeba9f) add checker for appending i128 to decimal builder ([#928](https://github.com/apache/arrow-rs/pull/928)) 
([#943](https://github.com/apache/arrow-rs/pull/943)) +* [dff14113884ad4246a8cafb9be579ebdb4e1481f](https://github.com/apache/arrow-rs/commit/dff14113884ad4246a8cafb9be579ebdb4e1481f) Validate arguments to ArrayData::new and null bit buffer and buffers ([#810](https://github.com/apache/arrow-rs/pull/810)) ([#936](https://github.com/apache/arrow-rs/pull/936)) +* [c3eae1ec56303b97c9e15263063a6a13122ef194](https://github.com/apache/arrow-rs/commit/c3eae1ec56303b97c9e15263063a6a13122ef194) fix some warning about unused variables in panic tests ([#894](https://github.com/apache/arrow-rs/pull/894)) ([#933](https://github.com/apache/arrow-rs/pull/933)) +* [e80bb018450f13a30811ffd244c42917d8bf8a62](https://github.com/apache/arrow-rs/commit/e80bb018450f13a30811ffd244c42917d8bf8a62) fix some clippy warnings ([#896](https://github.com/apache/arrow-rs/pull/896)) ([#930](https://github.com/apache/arrow-rs/pull/930)) +* [bde89463b627be3f60b5569d038ca36c434da71d](https://github.com/apache/arrow-rs/commit/bde89463b627be3f60b5569d038ca36c434da71d) feat(ipc): add support for deserializing messages with nested dictionary fields ([#923](https://github.com/apache/arrow-rs/pull/923)) ([#931](https://github.com/apache/arrow-rs/pull/931)) +* [792544b5fb7b84224ef9745ecb9f330663c14fb4](https://github.com/apache/arrow-rs/commit/792544b5fb7b84224ef9745ecb9f330663c14fb4) refactor regexp_is_match_utf8_scalar to try to mitigate miri failures ([#895](https://github.com/apache/arrow-rs/pull/895)) ([#932](https://github.com/apache/arrow-rs/pull/932)) +* [3f0e252811cbb6e3f7c774959787dcfec985d03e](https://github.com/apache/arrow-rs/commit/3f0e252811cbb6e3f7c774959787dcfec985d03e) Automatically retry failed MIRI runs to work around intermittent failures ([#934](https://github.com/apache/arrow-rs/pull/934)) +* [c9a9515c46d560ced00e23ff57cb10a1c97573cb](https://github.com/apache/arrow-rs/commit/c9a9515c46d560ced00e23ff57cb10a1c97573cb) Update mod.rs ([#909](https://github.com/apache/arrow-rs/pull/909)) 
([#919](https://github.com/apache/arrow-rs/pull/919)) +* [64ed79ece67141b92dc45b8a1d43cb9d909aa6a9](https://github.com/apache/arrow-rs/commit/64ed79ece67141b92dc45b8a1d43cb9d909aa6a9) Mark boolean kernels public ([#913](https://github.com/apache/arrow-rs/pull/913)) ([#920](https://github.com/apache/arrow-rs/pull/920)) +* [8b95fe0bbf03588c5cc00f67365c5b0dac4d7a34](https://github.com/apache/arrow-rs/commit/8b95fe0bbf03588c5cc00f67365c5b0dac4d7a34) doc example mistype ([#904](https://github.com/apache/arrow-rs/pull/904)) ([#918](https://github.com/apache/arrow-rs/pull/918)) +* [34c5eab4862cab16fdfd5f5ed6c68dce6298dfa4](https://github.com/apache/arrow-rs/commit/34c5eab4862cab16fdfd5f5ed6c68dce6298dfa4) allow null array to be cast to all other types ([#884](https://github.com/apache/arrow-rs/pull/884)) ([#917](https://github.com/apache/arrow-rs/pull/917)) +* [3c69752e55ed0c58f5a8faed918a22b45cd93766](https://github.com/apache/arrow-rs/commit/3c69752e55ed0c58f5a8faed918a22b45cd93766) Fix instances of UB that cause tests to not pass under miri ([#878](https://github.com/apache/arrow-rs/pull/878)) ([#916](https://github.com/apache/arrow-rs/pull/916)) +* [85402148c3af03d0855e81f855715ea98a7491c5](https://github.com/apache/arrow-rs/commit/85402148c3af03d0855e81f855715ea98a7491c5) feat(ipc): Support writing dictionaries nested in structs and unions ([#870](https://github.com/apache/arrow-rs/pull/870)) ([#915](https://github.com/apache/arrow-rs/pull/915)) +* [03d95e626cb0e654775fefa77786674ea41be4a2](https://github.com/apache/arrow-rs/commit/03d95e626cb0e654775fefa77786674ea41be4a2) Fix references to changelog ([#905](https://github.com/apache/arrow-rs/pull/905)) + + +## [6.1.0](https://github.com/apache/arrow-rs/tree/6.1.0) (2021-10-29) + + +[Full Changelog](https://github.com/apache/arrow-rs/compare/6.0.0...6.1.0) + +**Features / Fixes:** + +* [b42649b0088fe7762c713a41a23c1abdf8d0496d](https://github.com/apache/arrow-rs/commit/b42649b0088fe7762c713a41a23c1abdf8d0496d) 
implement eq_dyn and neq_dyn ([#858](https://github.com/apache/arrow-rs/pull/858)) ([#867](https://github.com/apache/arrow-rs/pull/867)) +* [01743f3f10a377c1ca857cd554acbf84155766d8](https://github.com/apache/arrow-rs/commit/01743f3f10a377c1ca857cd554acbf84155766d8) fix: fix a bug in offset calculation for unions ([#863](https://github.com/apache/arrow-rs/pull/863)) ([#871](https://github.com/apache/arrow-rs/pull/871)) +* [8bfff793a23f0e71008c7a9eea7a54d6b913ecff](https://github.com/apache/arrow-rs/commit/8bfff793a23f0e71008c7a9eea7a54d6b913ecff) add lt_bool, lt_eq_bool, gt_bool, gt_eq_bool ([#860](https://github.com/apache/arrow-rs/pull/860)) ([#868](https://github.com/apache/arrow-rs/pull/868)) +* [8845e91d4ab584c822e9ee903db7069551b124af](https://github.com/apache/arrow-rs/commit/8845e91d4ab584c822e9ee903db7069551b124af) fix(ipc): Support serializing structs containing dictionaries ([#848](https://github.com/apache/arrow-rs/pull/848)) ([#865](https://github.com/apache/arrow-rs/pull/865)) +* [620282a0d9fdd2a8ed7e8313d17ba3dec64c80e5](https://github.com/apache/arrow-rs/commit/620282a0d9fdd2a8ed7e8313d17ba3dec64c80e5) Implement boolean equality kernels ([#844](https://github.com/apache/arrow-rs/pull/844)) ([#857](https://github.com/apache/arrow-rs/pull/857)) +* [94cddcacf785be982e69689291ce034ef00220b4](https://github.com/apache/arrow-rs/commit/94cddcacf785be982e69689291ce034ef00220b4) Cherry pick fix parquet_derive with default features (and fix cargo publish) ([#856](https://github.com/apache/arrow-rs/pull/856)) +* [733fd583ddb3dbe6b4d58a809c444ee16ac0eae8](https://github.com/apache/arrow-rs/commit/733fd583ddb3dbe6b4d58a809c444ee16ac0eae8) Use kernel utility for parsing timestamps in csv reader. 
([#832](https://github.com/apache/arrow-rs/pull/832)) ([#853](https://github.com/apache/arrow-rs/pull/853)) +* [2cc64937a153f632796915d2d9869d5c2a501d28](https://github.com/apache/arrow-rs/commit/2cc64937a153f632796915d2d9869d5c2a501d28) [Minor] Fix clippy errors with new rust version (1.56) and float formatting with nightly ([#845](https://github.com/apache/arrow-rs/pull/845)) ([#850](https://github.com/apache/arrow-rs/pull/850)) + +**Other:** +* [bfac9e5a027e3bd78b7a1ec90c75a3e385bd66bb](https://github.com/apache/arrow-rs/commit/bfac9e5a027e3bd78b7a1ec90c75a3e385bd66bb) Test out new tarpaulin version ([#852](https://github.com/apache/arrow-rs/pull/852)) ([#866](https://github.com/apache/arrow-rs/pull/866)) +* [809350ced392cfc78d8a1a46228d4ffc25dea9ff](https://github.com/apache/arrow-rs/commit/809350ced392cfc78d8a1a46228d4ffc25dea9ff) Update README.md ([#834](https://github.com/apache/arrow-rs/pull/834)) ([#854](https://github.com/apache/arrow-rs/pull/854)) +* [70582f40dd21f5c710c4946266d0563a92b92337](https://github.com/apache/arrow-rs/commit/70582f40dd21f5c710c4946266d0563a92b92337) [MINOR] Delete temp file from docs ([#836](https://github.com/apache/arrow-rs/pull/836)) ([#855](https://github.com/apache/arrow-rs/pull/855)) +* [a721e00014015a7e598946b6efb9b1da8080ec85](https://github.com/apache/arrow-rs/commit/a721e00014015a7e598946b6efb9b1da8080ec85) Force fresh cargo cache key in CI ([#839](https://github.com/apache/arrow-rs/pull/839)) ([#851](https://github.com/apache/arrow-rs/pull/851)) + + +## [6.0.0](https://github.com/apache/arrow-rs/tree/6.0.0) (2021-10-13) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/5.5.0...6.0.0) + +**Breaking changes:** + +- Replace `ArrayData::new()` with `ArrayData::try_new()` and `unsafe ArrayData::new_unchecked` [\#822](https://github.com/apache/arrow-rs/pull/822) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([alamb](https://github.com/alamb)) +- Update Bitmap::len to return bits rather than bytes [\#749](https://github.com/apache/arrow-rs/pull/749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) +- use sort\_unstable\_by in primitive sorting [\#552](https://github.com/apache/arrow-rs/pull/552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- New MapArray support [\#491](https://github.com/apache/arrow-rs/pull/491) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([nevi-me](https://github.com/nevi-me)) + +**Implemented enhancements:** + +- Improve parquet binary writer speed by reducing allocations [\#819](https://github.com/apache/arrow-rs/issues/819) +- Expose buffer operations [\#808](https://github.com/apache/arrow-rs/issues/808) +- Add doc examples of writing parquet files using `ArrowWriter` [\#788](https://github.com/apache/arrow-rs/issues/788) + +**Fixed bugs:** + +- JSON reader can create null struct children on empty lists [\#825](https://github.com/apache/arrow-rs/issues/825) +- Incorrect null count for cast kernel for list arrays [\#815](https://github.com/apache/arrow-rs/issues/815) +- `minute` and `second` temporal kernels do not respect timezone [\#500](https://github.com/apache/arrow-rs/issues/500) +- Fix data corruption in json decoder f64-to-i64 cast [\#652](https://github.com/apache/arrow-rs/pull/652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xianwill](https://github.com/xianwill)) + +**Documentation updates:** + +- Doctest for PrimitiveArray using from\_iter\_values. [\#694](https://github.com/apache/arrow-rs/pull/694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([novemberkilo](https://github.com/novemberkilo)) +- Doctests for BinaryArray and LargeBinaryArray. 
[\#625](https://github.com/apache/arrow-rs/pull/625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([novemberkilo](https://github.com/novemberkilo)) +- Add links in docstrings [\#605](https://github.com/apache/arrow-rs/pull/605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + + +## [5.5.0](https://github.com/apache/arrow-rs/tree/5.5.0) (2021-09-24) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/5.4.0...5.5.0) + +**Implemented enhancements:** + +- parquet should depend on a small set of arrow features [\#800](https://github.com/apache/arrow-rs/issues/800) +- Support equality on RecordBatch [\#735](https://github.com/apache/arrow-rs/issues/735) + +**Fixed bugs:** + +- Converting from string to timestamp uses microseconds instead of milliseconds [\#780](https://github.com/apache/arrow-rs/issues/780) +- Document has no link to `RowColumIter` [\#762](https://github.com/apache/arrow-rs/issues/762) +- length on slices with null doesn't work [\#744](https://github.com/apache/arrow-rs/issues/744) + +## [5.4.0](https://github.com/apache/arrow-rs/tree/5.4.0) (2021-09-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/5.3.0...5.4.0) + +**Implemented enhancements:** + +- Upgrade lexical-core to 0.8 [\#747](https://github.com/apache/arrow-rs/issues/747) +- `append_nulls` and `append_trusted_len_iter` for PrimitiveBuilder [\#725](https://github.com/apache/arrow-rs/issues/725) +- Optimize MutableArrayData::extend for null buffers [\#397](https://github.com/apache/arrow-rs/issues/397) + +**Fixed bugs:** + +- Arithmetic with scalars doesn't work on slices [\#742](https://github.com/apache/arrow-rs/issues/742) +- Comparisons with scalar don't work on slices [\#740](https://github.com/apache/arrow-rs/issues/740) +- `unary` kernel doesn't respect offset [\#738](https://github.com/apache/arrow-rs/issues/738) +- `new_null_array` creates invalid struct arrays 
[\#734](https://github.com/apache/arrow-rs/issues/734) +- --no-default-features is broken for parquet [\#733](https://github.com/apache/arrow-rs/issues/733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `Bitmap::len` returns the number of bytes, not bits. [\#730](https://github.com/apache/arrow-rs/issues/730) +- Decimal logical type is formatted incorrectly by print\_schema [\#713](https://github.com/apache/arrow-rs/issues/713) +- parquet\_derive does not support chrono time values [\#711](https://github.com/apache/arrow-rs/issues/711) +- Numeric overflow when formatting Decimal type [\#710](https://github.com/apache/arrow-rs/issues/710) +- The integration tests are not running [\#690](https://github.com/apache/arrow-rs/issues/690) + +**Closed issues:** + +- Question: Is there no way to create a DictionaryArray with a pre-arranged mapping? [\#729](https://github.com/apache/arrow-rs/issues/729) + +## [5.3.0](https://github.com/apache/arrow-rs/tree/5.3.0) (2021-08-26) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/5.2.0...5.3.0) + +**Implemented enhancements:** + +- Add optimized filter kernel for regular expression matching [\#697](https://github.com/apache/arrow-rs/issues/697) +- Can't cast from timestamp array to string array [\#587](https://github.com/apache/arrow-rs/issues/587) + +**Fixed bugs:** + +- 'Encoding DELTA\_BYTE\_ARRAY is not supported' with parquet arrow readers [\#708](https://github.com/apache/arrow-rs/issues/708) +- Support reading json string into binary data type. 
[\#701](https://github.com/apache/arrow-rs/issues/701) + +**Closed issues:** + +- Resolve Issues with `prettytable-rs` dependency [\#69](https://github.com/apache/arrow-rs/issues/69) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +## [5.2.0](https://github.com/apache/arrow-rs/tree/5.2.0) (2021-08-12) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/5.1.0...5.2.0) + +**Implemented enhancements:** + +- Make rand an optional dependency [\#671](https://github.com/apache/arrow-rs/issues/671) +- Remove undefined behavior in `value` method of boolean and primitive arrays [\#645](https://github.com/apache/arrow-rs/issues/645) +- Avoid materialization of indices in filter\_record\_batch for single arrays [\#636](https://github.com/apache/arrow-rs/issues/636) +- Add a note about arrow crate security / safety [\#627](https://github.com/apache/arrow-rs/issues/627) +- Allow the creation of String arrays from an interator of &Option\<&str\> [\#598](https://github.com/apache/arrow-rs/issues/598) +- Support arrow map datatype [\#395](https://github.com/apache/arrow-rs/issues/395) + +**Fixed bugs:** + +- Parquet fixed length byte array columns write byte array statistics [\#660](https://github.com/apache/arrow-rs/issues/660) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet boolean columns write Int32 statistics [\#659](https://github.com/apache/arrow-rs/issues/659) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Writing Parquet with a boolean column fails [\#657](https://github.com/apache/arrow-rs/issues/657) +- JSON decoder data corruption for large i64/u64 [\#653](https://github.com/apache/arrow-rs/issues/653) +- Incorrect min/max statistics for strings in parquet files [\#641](https://github.com/apache/arrow-rs/issues/641) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Closed issues:** + +- Release candidate verifying script seems work on macOS 
[\#640](https://github.com/apache/arrow-rs/issues/640) +- Update CONTRIBUTING [\#342](https://github.com/apache/arrow-rs/issues/342) + +## [5.1.0](https://github.com/apache/arrow-rs/tree/5.1.0) (2021-07-29) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/5.0.0...5.1.0) + +**Implemented enhancements:** + +- Make FFI\_ArrowArray empty\(\) public [\#602](https://github.com/apache/arrow-rs/issues/602) +- exponential sort can be used to speed up lexico partition kernel [\#586](https://github.com/apache/arrow-rs/issues/586) +- Implement sort\(\) for binary array [\#568](https://github.com/apache/arrow-rs/issues/568) +- primitive sorting can be improved and more consistent with and without `limit` if sorted unstably [\#553](https://github.com/apache/arrow-rs/issues/553) + +**Fixed bugs:** + +- Confusing memory usage with CSV reader [\#623](https://github.com/apache/arrow-rs/issues/623) +- FFI implementation deviates from specification for array release [\#595](https://github.com/apache/arrow-rs/issues/595) +- Parquet file content is different if `~/.cargo` is in a git checkout [\#589](https://github.com/apache/arrow-rs/issues/589) +- Ensure output of MIRI is checked for success [\#581](https://github.com/apache/arrow-rs/issues/581) +- MIRI failure in `array::ffi::tests::test_struct` and other ffi tests [\#580](https://github.com/apache/arrow-rs/issues/580) +- ListArray equality check may return wrong result [\#570](https://github.com/apache/arrow-rs/issues/570) +- cargo audit failed [\#561](https://github.com/apache/arrow-rs/issues/561) +- ArrayData::slice\(\) does not work for nested types such as StructArray [\#554](https://github.com/apache/arrow-rs/issues/554) + +**Documentation updates:** + +- More examples of how to construct Arrays [\#301](https://github.com/apache/arrow-rs/issues/301) + +**Closed issues:** + +- Implement StringBuilder::append\_option [\#263](https://github.com/apache/arrow-rs/issues/263) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +## [5.0.0](https://github.com/apache/arrow-rs/tree/5.0.0) (2021-07-14) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/4.4.0...5.0.0) + +**Breaking changes:** + +- Remove lifetime from DynComparator [\#543](https://github.com/apache/arrow-rs/issues/543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Simplify interactions with arrow flight APIs [\#376](https://github.com/apache/arrow-rs/issues/376) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- refactor: remove lifetime from DynComparator [\#542](https://github.com/apache/arrow-rs/pull/542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([e-dard](https://github.com/e-dard)) +- use iterator for partition kernel instead of generating vec [\#438](https://github.com/apache/arrow-rs/pull/438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- Remove DictionaryArray::keys\_array method [\#419](https://github.com/apache/arrow-rs/pull/419) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- simplify interactions with arrow flight APIs [\#377](https://github.com/apache/arrow-rs/pull/377) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([garyanaplan](https://github.com/garyanaplan)) +- return reference from DictionaryArray::values\(\) \(\#313\) [\#314](https://github.com/apache/arrow-rs/pull/314) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Allow creation of StringArrays from Vec\ [\#519](https://github.com/apache/arrow-rs/issues/519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement RecordBatch::concat [\#461](https://github.com/apache/arrow-rs/issues/461) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement RecordBatch::slice\(\) 
to slice RecordBatches [\#460](https://github.com/apache/arrow-rs/issues/460) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add a RecordBatch::split to split large batches into a set of smaller batches [\#343](https://github.com/apache/arrow-rs/issues/343) +- generate parquet schema from rust struct [\#539](https://github.com/apache/arrow-rs/pull/539) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) +- Implement `RecordBatch::concat` [\#537](https://github.com/apache/arrow-rs/pull/537) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([silathdiir](https://github.com/silathdiir)) +- Implement function slice for RecordBatch [\#490](https://github.com/apache/arrow-rs/pull/490) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([b41sh](https://github.com/b41sh)) +- add lexicographically partition points and ranges [\#424](https://github.com/apache/arrow-rs/pull/424) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- allow to read non-standard CSV [\#326](https://github.com/apache/arrow-rs/pull/326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kazuk](https://github.com/kazuk)) +- parquet: Speed up `BitReader`/`DeltaBitPackDecoder` [\#325](https://github.com/apache/arrow-rs/pull/325) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kornholi](https://github.com/kornholi)) +- ARROW-12343: \[Rust\] Support auto-vectorization for min/max [\#9](https://github.com/apache/arrow-rs/pull/9) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- ARROW-12411: \[Rust\] Create RecordBatches from Iterators [\#7](https://github.com/apache/arrow-rs/pull/7) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Fixed bugs:** + +- Error building on master - error: cyclic package dependency: package `ahash v0.7.4` depends on 
itself. Cycle [\#544](https://github.com/apache/arrow-rs/issues/544) +- IPC reader panics with out of bounds error [\#541](https://github.com/apache/arrow-rs/issues/541) +- Take kernel doesn't handle nulls and structs correctly [\#530](https://github.com/apache/arrow-rs/issues/530) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- master fails to compile with `default-features=false` [\#529](https://github.com/apache/arrow-rs/issues/529) +- README developer instructions out of date [\#523](https://github.com/apache/arrow-rs/issues/523) +- Update rustc and packed\_simd in CI before 5.0 release [\#517](https://github.com/apache/arrow-rs/issues/517) +- Incorrect memory usage calculation for dictionary arrays [\#503](https://github.com/apache/arrow-rs/issues/503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- sliced null buffers lead to incorrect result in take kernel \(and probably on other places\) [\#502](https://github.com/apache/arrow-rs/issues/502) +- Cast of utf8 types and list container types don't respect offset [\#334](https://github.com/apache/arrow-rs/issues/334) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- fix take kernel null handling on structs [\#531](https://github.com/apache/arrow-rs/pull/531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([bjchambers](https://github.com/bjchambers)) +- Correct array memory usage calculation for dictionary arrays [\#505](https://github.com/apache/arrow-rs/pull/505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- parquet: improve BOOLEAN writing logic and report error on encoding fail [\#443](https://github.com/apache/arrow-rs/pull/443) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([garyanaplan](https://github.com/garyanaplan)) +- Fix bug with null buffer offset in boolean not kernel [\#418](https://github.com/apache/arrow-rs/pull/418) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- respect offset in utf8 and list casts [\#335](https://github.com/apache/arrow-rs/pull/335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) +- Fix comparison of dictionaries with different values arrays \(\#332\) [\#333](https://github.com/apache/arrow-rs/pull/333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- ensure null-counts are written for all-null columns [\#307](https://github.com/apache/arrow-rs/pull/307) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) +- fix invalid null handling in filter [\#296](https://github.com/apache/arrow-rs/pull/296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) +- fix NaN handling in parquet statistics [\#256](https://github.com/apache/arrow-rs/pull/256) ([crepererum](https://github.com/crepererum)) + +**Documentation updates:** + +- Improve arrow's crate's readme on crates.io [\#463](https://github.com/apache/arrow-rs/issues/463) +- Clean up README.md in advance of the 5.0 release [\#536](https://github.com/apache/arrow-rs/pull/536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- fix readme instructions to reflect new structure [\#524](https://github.com/apache/arrow-rs/pull/524) ([marcvanheerden](https://github.com/marcvanheerden)) +- Improve docs for NullArray, new\_null\_array and new\_empty\_array [\#240](https://github.com/apache/arrow-rs/pull/240) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Fix default arrow build 
[\#533](https://github.com/apache/arrow-rs/pull/533) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add tests for building applications using arrow with different feature flags [\#532](https://github.com/apache/arrow-rs/pull/532) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Remove unused futures dependency from arrow-flight [\#528](https://github.com/apache/arrow-rs/pull/528) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- CI: update rust nightly and packed\_simd [\#525](https://github.com/apache/arrow-rs/pull/525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) +- Support `StringArray` creation from String Vec [\#522](https://github.com/apache/arrow-rs/pull/522) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([silathdiir](https://github.com/silathdiir)) +- Fix parquet benchmark schema [\#513](https://github.com/apache/arrow-rs/pull/513) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) +- Fix parquet definition levels [\#511](https://github.com/apache/arrow-rs/pull/511) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) +- Fix for primitive and boolean take kernel for nullable indices with an offset [\#509](https://github.com/apache/arrow-rs/pull/509) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Bump flatbuffers [\#499](https://github.com/apache/arrow-rs/pull/499) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([PsiACE](https://github.com/PsiACE)) +- implement second/minute helpers for temporal [\#493](https://github.com/apache/arrow-rs/pull/493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr)) +- special case 
concatenating single element array shortcut [\#492](https://github.com/apache/arrow-rs/pull/492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- update docs to reflect recent changes \(joins and window functions\) [\#489](https://github.com/apache/arrow-rs/pull/489) ([Jimexist](https://github.com/Jimexist)) +- Update rand, proc-macro and zstd dependencies [\#488](https://github.com/apache/arrow-rs/pull/488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Doctest for GenericListArray. [\#474](https://github.com/apache/arrow-rs/pull/474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([novemberkilo](https://github.com/novemberkilo)) +- remove stale comment on `ArrayData` equality and update unit tests [\#472](https://github.com/apache/arrow-rs/pull/472) ([Jimexist](https://github.com/Jimexist)) +- remove unused patch file [\#471](https://github.com/apache/arrow-rs/pull/471) ([Jimexist](https://github.com/Jimexist)) +- fix clippy warnings for rust 1.53 [\#470](https://github.com/apache/arrow-rs/pull/470) ([Jimexist](https://github.com/Jimexist)) +- Fix PR labeler [\#468](https://github.com/apache/arrow-rs/pull/468) ([Dandandan](https://github.com/Dandandan)) +- Tweak dev backporting docs [\#466](https://github.com/apache/arrow-rs/pull/466) ([alamb](https://github.com/alamb)) +- Unvendor Archery [\#459](https://github.com/apache/arrow-rs/pull/459) ([kszucs](https://github.com/kszucs)) +- Add sort boolean benchmark [\#457](https://github.com/apache/arrow-rs/pull/457) ([alamb](https://github.com/alamb)) +- Add C data interface for decimal128 and timestamp [\#453](https://github.com/apache/arrow-rs/pull/453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alippai](https://github.com/alippai)) +- Implement the 
Iterator trait for the json Reader. [\#451](https://github.com/apache/arrow-rs/pull/451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([LaurentMazare](https://github.com/LaurentMazare)) +- Update release docs + release email template [\#450](https://github.com/apache/arrow-rs/pull/450) ([alamb](https://github.com/alamb)) +- remove clippy unnecessary wraps suppresions in cast kernel [\#449](https://github.com/apache/arrow-rs/pull/449) ([Jimexist](https://github.com/Jimexist)) +- Use partition for bool sort [\#448](https://github.com/apache/arrow-rs/pull/448) ([Jimexist](https://github.com/Jimexist)) +- remove unnecessary wraps in sort [\#445](https://github.com/apache/arrow-rs/pull/445) ([Jimexist](https://github.com/Jimexist)) +- Python FFI bridge for Schema, Field and DataType [\#439](https://github.com/apache/arrow-rs/pull/439) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszucs](https://github.com/kszucs)) +- Update release Readme.md [\#436](https://github.com/apache/arrow-rs/pull/436) ([alamb](https://github.com/alamb)) +- Derive Eq and PartialEq for SortOptions [\#425](https://github.com/apache/arrow-rs/pull/425) ([tustvold](https://github.com/tustvold)) +- refactor lexico sort for future code reuse [\#423](https://github.com/apache/arrow-rs/pull/423) ([Jimexist](https://github.com/Jimexist)) +- Reenable MIRI check on PRs [\#421](https://github.com/apache/arrow-rs/pull/421) ([alamb](https://github.com/alamb)) +- Sort by float lists [\#420](https://github.com/apache/arrow-rs/pull/420) ([medwards](https://github.com/medwards)) +- Fix out of bounds read in bit chunk iterator [\#416](https://github.com/apache/arrow-rs/pull/416) ([jhorstmann](https://github.com/jhorstmann)) +- Doctests for DecimalArray. 
[\#414](https://github.com/apache/arrow-rs/pull/414) ([novemberkilo](https://github.com/novemberkilo)) +- Add Decimal to CsvWriter and improve debug display [\#406](https://github.com/apache/arrow-rs/pull/406) ([alippai](https://github.com/alippai)) +- MINOR: update install instruction [\#400](https://github.com/apache/arrow-rs/pull/400) ([alippai](https://github.com/alippai)) +- use prettier to auto format md files [\#398](https://github.com/apache/arrow-rs/pull/398) ([Jimexist](https://github.com/Jimexist)) +- window::shift to work for all array types [\#388](https://github.com/apache/arrow-rs/pull/388) ([Jimexist](https://github.com/Jimexist)) +- add more tests for window::shift and handle boundary cases [\#386](https://github.com/apache/arrow-rs/pull/386) ([Jimexist](https://github.com/Jimexist)) +- Implement faster arrow array reader [\#384](https://github.com/apache/arrow-rs/pull/384) ([yordan-pavlov](https://github.com/yordan-pavlov)) +- Add set\_bit to BooleanBufferBuilder to allow mutating bit in index [\#383](https://github.com/apache/arrow-rs/pull/383) ([boazberman](https://github.com/boazberman)) +- make sure that only concat preallocates buffers [\#382](https://github.com/apache/arrow-rs/pull/382) ([ritchie46](https://github.com/ritchie46)) +- Respect max rowgroup size in Arrow writer [\#381](https://github.com/apache/arrow-rs/pull/381) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) +- Fix typo in release script, update release location [\#380](https://github.com/apache/arrow-rs/pull/380) ([alamb](https://github.com/alamb)) +- Doctests for FixedSizeBinaryArray [\#378](https://github.com/apache/arrow-rs/pull/378) ([novemberkilo](https://github.com/novemberkilo)) +- Simplify shift kernel using new\_null\_array [\#370](https://github.com/apache/arrow-rs/pull/370) ([Dandandan](https://github.com/Dandandan)) +- allow `SliceableCursor` to be constructed from an `Arc` directly 
[\#369](https://github.com/apache/arrow-rs/pull/369) ([crepererum](https://github.com/crepererum)) +- Add doctest for ArrayBuilder [\#367](https://github.com/apache/arrow-rs/pull/367) ([alippai](https://github.com/alippai)) +- Fix version in readme [\#365](https://github.com/apache/arrow-rs/pull/365) ([domoritz](https://github.com/domoritz)) +- Remove superfluous space [\#363](https://github.com/apache/arrow-rs/pull/363) ([domoritz](https://github.com/domoritz)) +- Add crate badges [\#362](https://github.com/apache/arrow-rs/pull/362) ([domoritz](https://github.com/domoritz)) +- Disable MIRI check until it runs cleanly on CI [\#360](https://github.com/apache/arrow-rs/pull/360) ([alamb](https://github.com/alamb)) +- Only register Flight.proto with cargo if it exists [\#351](https://github.com/apache/arrow-rs/pull/351) ([tustvold](https://github.com/tustvold)) +- Reduce memory usage of concat \(large\)utf8 [\#348](https://github.com/apache/arrow-rs/pull/348) ([ritchie46](https://github.com/ritchie46)) +- Fix filter UB and add fast path [\#341](https://github.com/apache/arrow-rs/pull/341) ([ritchie46](https://github.com/ritchie46)) +- Automatic cherry-pick script [\#339](https://github.com/apache/arrow-rs/pull/339) ([alamb](https://github.com/alamb)) +- Doctests for BooleanArray. [\#338](https://github.com/apache/arrow-rs/pull/338) ([novemberkilo](https://github.com/novemberkilo)) +- feature gate ipc reader/writer [\#336](https://github.com/apache/arrow-rs/pull/336) ([ritchie46](https://github.com/ritchie46)) +- Add ported Rust release verification script [\#331](https://github.com/apache/arrow-rs/pull/331) ([wesm](https://github.com/wesm)) +- Doctests for StringArray and LargeStringArray. 
[\#330](https://github.com/apache/arrow-rs/pull/330) ([novemberkilo](https://github.com/novemberkilo)) +- inline PrimitiveArray::value [\#329](https://github.com/apache/arrow-rs/pull/329) ([ritchie46](https://github.com/ritchie46)) +- Enable wasm32 as a target architecture for the SIMD feature [\#324](https://github.com/apache/arrow-rs/pull/324) ([roee88](https://github.com/roee88)) +- Fix undefined behavior in FFI and enable MIRI checks on CI [\#323](https://github.com/apache/arrow-rs/pull/323) ([roee88](https://github.com/roee88)) +- Mutablebuffer::shrink\_to\_fit [\#318](https://github.com/apache/arrow-rs/pull/318) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) +- Add \(simd\) modulus op [\#317](https://github.com/apache/arrow-rs/pull/317) ([gangliao](https://github.com/gangliao)) +- feature gate csv functionality [\#312](https://github.com/apache/arrow-rs/pull/312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) +- \[Minor\] Version upgrades [\#304](https://github.com/apache/arrow-rs/pull/304) ([Dandandan](https://github.com/Dandandan)) +- Remove old release scripts [\#293](https://github.com/apache/arrow-rs/pull/293) ([alamb](https://github.com/alamb)) +- Add Send to the ArrayBuilder trait [\#291](https://github.com/apache/arrow-rs/pull/291) ([Max-Meldrum](https://github.com/Max-Meldrum)) +- Added changelog generator script and configuration. 
[\#289](https://github.com/apache/arrow-rs/pull/289) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- manually bump development version [\#288](https://github.com/apache/arrow-rs/pull/288) ([nevi-me](https://github.com/nevi-me)) +- Fix FFI and add support for Struct type [\#287](https://github.com/apache/arrow-rs/pull/287) ([roee88](https://github.com/roee88)) +- Fix subtraction underflow when sorting string arrays with many nulls [\#285](https://github.com/apache/arrow-rs/pull/285) ([medwards](https://github.com/medwards)) +- Speed up bound checking in `take` [\#281](https://github.com/apache/arrow-rs/pull/281) ([Dandandan](https://github.com/Dandandan)) +- Update PR template by commenting out instructions [\#278](https://github.com/apache/arrow-rs/pull/278) ([nevi-me](https://github.com/nevi-me)) +- Added Decimal support to pretty-print display utility \(\#230\) [\#273](https://github.com/apache/arrow-rs/pull/273) ([mgill25](https://github.com/mgill25)) +- Fix null struct and list roundtrip [\#270](https://github.com/apache/arrow-rs/pull/270) ([nevi-me](https://github.com/nevi-me)) +- 1.52 clippy fixes [\#267](https://github.com/apache/arrow-rs/pull/267) ([nevi-me](https://github.com/nevi-me)) +- Fix typo in csv/reader.rs [\#265](https://github.com/apache/arrow-rs/pull/265) ([domoritz](https://github.com/domoritz)) +- Fix empty Schema::metadata deserialization error [\#260](https://github.com/apache/arrow-rs/pull/260) ([hulunbier](https://github.com/hulunbier)) +- update datafusion and ballista doc links [\#259](https://github.com/apache/arrow-rs/pull/259) ([Jimexist](https://github.com/Jimexist)) +- support full u32 and u64 roundtrip through parquet [\#258](https://github.com/apache/arrow-rs/pull/258) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) +- \[MINOR\] Added env to run rust in integration. 
[\#253](https://github.com/apache/arrow-rs/pull/253) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- \[Minor\] Made integration tests always run. [\#248](https://github.com/apache/arrow-rs/pull/248) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- fix parquet max\_definition for non-null structs [\#246](https://github.com/apache/arrow-rs/pull/246) ([nevi-me](https://github.com/nevi-me)) +- Disabled rebase needed until demonstrate working. [\#243](https://github.com/apache/arrow-rs/pull/243) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- pin flatbuffers to 0.8.4 [\#239](https://github.com/apache/arrow-rs/pull/239) ([ritchie46](https://github.com/ritchie46)) +- sort\_primitive result is capped to the min of limit or values.len [\#236](https://github.com/apache/arrow-rs/pull/236) ([medwards](https://github.com/medwards)) +- Read list field correctly [\#234](https://github.com/apache/arrow-rs/pull/234) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) +- Fix code examples for RecordBatch::try\_from\_iter [\#231](https://github.com/apache/arrow-rs/pull/231) ([alamb](https://github.com/alamb)) +- Support string dictionaries in csv reader \(\#228\) [\#229](https://github.com/apache/arrow-rs/pull/229) ([tustvold](https://github.com/tustvold)) +- support LargeUtf8 in sort kernel [\#26](https://github.com/apache/arrow-rs/pull/26) ([ritchie46](https://github.com/ritchie46)) +- Removed unused files [\#22](https://github.com/apache/arrow-rs/pull/22) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- ARROW-12504: Buffer::from\_slice\_ref set correct capacity [\#18](https://github.com/apache/arrow-rs/pull/18) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add GitHub templates [\#17](https://github.com/apache/arrow-rs/pull/17) ([andygrove](https://github.com/andygrove)) +- ARROW-12493: Add support for writing dictionary arrays to CSV and 
JSON [\#16](https://github.com/apache/arrow-rs/pull/16) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- ARROW-12426: \[Rust\] Fix concatentation of arrow dictionaries [\#15](https://github.com/apache/arrow-rs/pull/15) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update repository and homepage urls [\#14](https://github.com/apache/arrow-rs/pull/14) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Dandandan](https://github.com/Dandandan)) +- Added rebase-needed bot [\#13](https://github.com/apache/arrow-rs/pull/13) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- Added Integration tests against arrow [\#10](https://github.com/apache/arrow-rs/pull/10) ([jorgecarleitao](https://github.com/jorgecarleitao)) + +## [4.4.0](https://github.com/apache/arrow-rs/tree/4.4.0) (2021-06-24) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/4.3.0...4.4.0) + +**Breaking changes:** + +- migrate partition kernel to use Iterator trait [\#437](https://github.com/apache/arrow-rs/issues/437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove DictionaryArray::keys\_array [\#391](https://github.com/apache/arrow-rs/issues/391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Implemented enhancements:** + +- sort kernel boolean sort can be O\(n\) [\#447](https://github.com/apache/arrow-rs/issues/447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- C data interface for decimal128, timestamp, date32 and date64 [\#413](https://github.com/apache/arrow-rs/issues/413) +- Add Decimal to CsvWriter [\#405](https://github.com/apache/arrow-rs/issues/405) +- Use iterators to increase performance of creating Arrow arrays [\#200](https://github.com/apache/arrow-rs/issues/200) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Release Audit Tool \(RAT\) is not being triggered [\#481](https://github.com/apache/arrow-rs/issues/481) +- Security Vulnerabilities: flatbuffers: `read_scalar` and `read_scalar_at` allow transmuting values without `unsafe` blocks [\#476](https://github.com/apache/arrow-rs/issues/476) +- Clippy broken after upgrade to rust 1.53 [\#467](https://github.com/apache/arrow-rs/issues/467) +- Pull Request Labeler is not working [\#462](https://github.com/apache/arrow-rs/issues/462) +- Arrow 4.3 release: error\[E0658\]: use of unstable library feature 'partition\_point': new API [\#456](https://github.com/apache/arrow-rs/issues/456) +- parquet reading hangs when row\_group contains more than 2048 rows of data [\#349](https://github.com/apache/arrow-rs/issues/349) +- Fail to build arrow [\#247](https://github.com/apache/arrow-rs/issues/247) +- JSON reader does not implement iterator [\#193](https://github.com/apache/arrow-rs/issues/193) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Security fixes:** + +- Ensure a successful MIRI Run on CI [\#227](https://github.com/apache/arrow-rs/issues/227) + +**Closed issues:** + +- sort kernel has a lot of unnecessary wrapping [\#446](https://github.com/apache/arrow-rs/issues/446) +- \[Parquet\] Plain encoded boolean column chunks limited to 2048 values [\#48](https://github.com/apache/arrow-rs/issues/48) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +## [4.3.0](https://github.com/apache/arrow-rs/tree/4.3.0) (2021-06-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/4.2.0...4.3.0) + +**Implemented enhancements:** + +- Add partitioning kernel for sorted arrays [\#428](https://github.com/apache/arrow-rs/issues/428) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement sort by float lists [\#427](https://github.com/apache/arrow-rs/issues/427) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Derive Eq and PartialEq for SortOptions [\#426](https://github.com/apache/arrow-rs/issues/426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- use prettier and github action to normalize markdown document syntax [\#399](https://github.com/apache/arrow-rs/issues/399) +- window::shift can work for more than just primitive array type [\#392](https://github.com/apache/arrow-rs/issues/392) +- Doctest for ArrayBuilder [\#366](https://github.com/apache/arrow-rs/issues/366) + +**Fixed bugs:** + +- Boolean `not` kernel does not take offset of null buffer into account [\#417](https://github.com/apache/arrow-rs/issues/417) +- my contribution not marged in 4.2 release [\#394](https://github.com/apache/arrow-rs/issues/394) +- window::shift shall properly handle boundary cases [\#387](https://github.com/apache/arrow-rs/issues/387) +- Parquet `WriterProperties.max_row_group_size` not wired up [\#257](https://github.com/apache/arrow-rs/issues/257) +- Out of bound reads in chunk iterator [\#198](https://github.com/apache/arrow-rs/issues/198) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +## [4.2.0](https://github.com/apache/arrow-rs/tree/4.2.0) (2021-05-29) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/4.1.0...4.2.0) + +**Breaking changes:** + +- DictionaryArray::values\(\) clones the underlying ArrayRef [\#313](https://github.com/apache/arrow-rs/issues/313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Implemented enhancements:** + +- Simplify shift kernel using null array [\#371](https://github.com/apache/arrow-rs/issues/371) +- Provide `Arc`-based constructor for `parquet::util::cursor::SliceableCursor` [\#368](https://github.com/apache/arrow-rs/issues/368) +- Add badges to crates [\#361](https://github.com/apache/arrow-rs/issues/361) +- Consider inlining PrimitiveArray::value [\#328](https://github.com/apache/arrow-rs/issues/328) +- Implement automated 
release verification script [\#327](https://github.com/apache/arrow-rs/issues/327) +- Add wasm32 to the list of target architectures of the simd feature [\#316](https://github.com/apache/arrow-rs/issues/316) +- add with\_escape for csv::ReaderBuilder [\#315](https://github.com/apache/arrow-rs/issues/315) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- IPC feature gate [\#310](https://github.com/apache/arrow-rs/issues/310) +- csv feature gate [\#309](https://github.com/apache/arrow-rs/issues/309) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `shrink_to` / `shrink_to_fit` to `MutableBuffer` [\#297](https://github.com/apache/arrow-rs/issues/297) + +**Fixed bugs:** + +- Incorrect crate setup instructions [\#364](https://github.com/apache/arrow-rs/issues/364) +- Arrow-flight only register rerun-if-changed if file exists [\#350](https://github.com/apache/arrow-rs/issues/350) +- Dictionary Comparison Uses Wrong Values Array [\#332](https://github.com/apache/arrow-rs/issues/332) +- Undefined behavior in FFI implementation [\#322](https://github.com/apache/arrow-rs/issues/322) +- All-null column get wrong parquet null-counts [\#306](https://github.com/apache/arrow-rs/issues/306) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Filter has inconsistent null handling [\#295](https://github.com/apache/arrow-rs/issues/295) + +## [4.1.0](https://github.com/apache/arrow-rs/tree/4.1.0) (2021-05-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/4.0.0...4.1.0) + +**Implemented enhancements:** + +- Add Send to ArrayBuilder [\#290](https://github.com/apache/arrow-rs/issues/290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of bound checking option [\#280](https://github.com/apache/arrow-rs/issues/280) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- extend compute kernel arity to include nullary functions [\#276](https://github.com/apache/arrow-rs/issues/276) +- 
Implement FFI / CDataInterface for Struct Arrays [\#251](https://github.com/apache/arrow-rs/issues/251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for pretty-printing Decimal numbers [\#230](https://github.com/apache/arrow-rs/issues/230) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CSV Reader String Dictionary Support [\#228](https://github.com/apache/arrow-rs/issues/228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Builder interface for adding Arrays to record batches [\#210](https://github.com/apache/arrow-rs/issues/210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support auto-vectorization for min/max [\#209](https://github.com/apache/arrow-rs/issues/209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support LargeUtf8 in sort kernel [\#25](https://github.com/apache/arrow-rs/issues/25) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- no method named `select_nth_unstable_by` found for mutable reference `&mut [T]` [\#283](https://github.com/apache/arrow-rs/issues/283) +- Rust 1.52 Clippy error [\#266](https://github.com/apache/arrow-rs/issues/266) +- NaNs can break parquet statistics [\#255](https://github.com/apache/arrow-rs/issues/255) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- u64::MAX does not roundtrip through parquet [\#254](https://github.com/apache/arrow-rs/issues/254) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Integration tests failing to compile \(flatbuffer\) [\#249](https://github.com/apache/arrow-rs/issues/249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix compatibility quirks between arrow and parquet structs [\#245](https://github.com/apache/arrow-rs/issues/245) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Unable to write non-null Arrow structs to Parquet [\#244](https://github.com/apache/arrow-rs/issues/244) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- schema: missing field `metadata` when deserialize [\#241](https://github.com/apache/arrow-rs/issues/241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Arrow does not compile due to flatbuffers upgrade [\#238](https://github.com/apache/arrow-rs/issues/238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sort with limit panics for the limit includes some but not all nulls, for large arrays [\#235](https://github.com/apache/arrow-rs/issues/235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-rs contains a copy of the "format" directory [\#233](https://github.com/apache/arrow-rs/issues/233) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix SEGFAULT/ SIGILL in child-data ffi [\#206](https://github.com/apache/arrow-rs/issues/206) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Read list field correctly in \\> [\#167](https://github.com/apache/arrow-rs/issues/167) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- FFI listarray lead to undefined behavior. 
[\#20](https://github.com/apache/arrow-rs/issues/20) + +**Security fixes:** + +- Fix MIRI build on CI [\#226](https://github.com/apache/arrow-rs/issues/226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Get MIRI running again [\#224](https://github.com/apache/arrow-rs/issues/224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Comment out the instructions in the PR template [\#277](https://github.com/apache/arrow-rs/issues/277) +- Update links to datafusion and ballista in README.md [\#19](https://github.com/apache/arrow-rs/issues/19) +- Update "repository" in Cargo.toml [\#12](https://github.com/apache/arrow-rs/issues/12) + +**Closed issues:** + +- Arrow Aligned Vec [\#268](https://github.com/apache/arrow-rs/issues/268) +- \[Rust\]: Tracking issue for AVX-512 [\#220](https://github.com/apache/arrow-rs/issues/220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Umbrella issue for clippy integration [\#217](https://github.com/apache/arrow-rs/issues/217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support sort [\#215](https://github.com/apache/arrow-rs/issues/215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support stable Rust [\#214](https://github.com/apache/arrow-rs/issues/214) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove Rust and point integration tests to arrow-rs repo [\#211](https://github.com/apache/arrow-rs/issues/211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ArrayData buffers are inconsistent accross implementations [\#207](https://github.com/apache/arrow-rs/issues/207) +- 3.0.1 patch release [\#204](https://github.com/apache/arrow-rs/issues/204) +- Document patch release process [\#202](https://github.com/apache/arrow-rs/issues/202) +- Simplify Offset [\#186](https://github.com/apache/arrow-rs/issues/186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Typed Bytes 
[\#185](https://github.com/apache/arrow-rs/issues/185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[CI\]docker-compose setup should enable caching [\#175](https://github.com/apache/arrow-rs/issues/175) +- Improve take primitive performance [\#174](https://github.com/apache/arrow-rs/issues/174) +- \[CI\] Try out buildkite [\#165](https://github.com/apache/arrow-rs/issues/165) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update assignees in JIRA where missing [\#160](https://github.com/apache/arrow-rs/issues/160) +- \[Rust\]: From\ implementations should validate data type [\#103](https://github.com/apache/arrow-rs/issues/103) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] Verify that projection push down does not remove aliases columns [\#99](https://github.com/apache/arrow-rs/issues/99) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Rust\]\[DataFusion\] Implement modulus expression [\#98](https://github.com/apache/arrow-rs/issues/98) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] Add constant folding to expressions during logically planning [\#96](https://github.com/apache/arrow-rs/issues/96) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] DataFrame.collect should return RecordBatchReader [\#95](https://github.com/apache/arrow-rs/issues/95) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Rust\]\[DataFusion\] Add FORMAT to explain plan and an easy to visualize format [\#94](https://github.com/apache/arrow-rs/issues/94) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] Implement metrics framework [\#90](https://github.com/apache/arrow-rs/issues/90) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] Implement micro benchmarks for each operator [\#89](https://github.com/apache/arrow-rs/issues/89) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
\[DataFusion\] Implement pretty print for physical query plan [\#88](https://github.com/apache/arrow-rs/issues/88) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Archery\] Support rust clippy in the lint command [\#83](https://github.com/apache/arrow-rs/issues/83) +- \[rust\]\[datafusion\] optimize count\(\*\) queries on parquet sources [\#75](https://github.com/apache/arrow-rs/issues/75) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Rust\]\[DataFusion\] Improve like/nlike performance [\#71](https://github.com/apache/arrow-rs/issues/71) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] Implement optimizer rule to remove redundant projections [\#56](https://github.com/apache/arrow-rs/issues/56) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DataFusion\] Parquet data source does not support complex types [\#39](https://github.com/apache/arrow-rs/issues/39) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Merge utils from Parquet and Arrow [\#32](https://github.com/apache/arrow-rs/issues/32) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add benchmarks for Parquet [\#30](https://github.com/apache/arrow-rs/issues/30) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Mark methods that do not perform bounds checking as unsafe [\#28](https://github.com/apache/arrow-rs/issues/28) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Test issue [\#24](https://github.com/apache/arrow-rs/issues/24) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- This is a test issue [\#11](https://github.com/apache/arrow-rs/issues/11) diff --git a/CHANGELOG.md b/CHANGELOG.md index 890e70bd5d12..549d4da1a6b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,1297 +19,82 @@ # Changelog -## [15.0.0](https://github.com/apache/arrow-rs/tree/15.0.0) (2022-05-27) +## 
[16.0.0](https://github.com/apache/arrow-rs/tree/16.0.0) (2022-06-10) -[Full Changelog](https://github.com/apache/arrow-rs/compare/14.0.0...15.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/15.0.0...16.0.0) **Breaking changes:** -- Change `ArrayDataBuilder::null_bit_buffer` to accept `Option` rather than `Buffer` [\#1739](https://github.com/apache/arrow-rs/pull/1739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove `null_count` from `ArrayData::try_new()` [\#1721](https://github.com/apache/arrow-rs/pull/1721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Change parquet writers to use standard `std:io::Write` rather custom `ParquetWriter` trait \(\#1717\) \(\#1163\) [\#1719](https://github.com/apache/arrow-rs/pull/1719) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add explicit column mask for selection in parquet: `ProjectionMask` \(\#1701\) [\#1716](https://github.com/apache/arrow-rs/pull/1716) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add type\_ids in Union datatype [\#1703](https://github.com/apache/arrow-rs/pull/1703) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix Parquet Reader's Arrow Schema Inference [\#1682](https://github.com/apache/arrow-rs/pull/1682) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Seal `ArrowNativeType` and `OffsetSizeTrait` for safety \(\#1028\) [\#1819](https://github.com/apache/arrow-rs/pull/1819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve API for 
`csv::infer_file_schema` by removing redundant ref [\#1776](https://github.com/apache/arrow-rs/pull/1776) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Rename the `string` kernel to `concatenate_elements` [\#1747](https://github.com/apache/arrow-rs/issues/1747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `ArrayDataBuilder::null_bit_buffer` should accept `Option` as input type [\#1737](https://github.com/apache/arrow-rs/issues/1737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix schema comparison for non\_canonical\_map when running flight test [\#1730](https://github.com/apache/arrow-rs/issues/1730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support in aggregate kernel for `BinaryArray` [\#1724](https://github.com/apache/arrow-rs/issues/1724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix incorrect null\_count in `generate_unions_case` integration test [\#1712](https://github.com/apache/arrow-rs/issues/1712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Keep type ids in Union datatype to follow Arrow spec and integrate with other implementations [\#1690](https://github.com/apache/arrow-rs/issues/1690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support Reading Alternative List Representations to Arrow From Parquet [\#1680](https://github.com/apache/arrow-rs/issues/1680) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Speed up the offsets checking [\#1675](https://github.com/apache/arrow-rs/issues/1675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Separate Parquet -\> Arrow Schema Conversion From ArrayBuilder [\#1655](https://github.com/apache/arrow-rs/issues/1655) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add `leaf_columns` argument to `ArrowReader::get_record_reader_by_columns` 
[\#1653](https://github.com/apache/arrow-rs/issues/1653) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Implement `string_concat` kernel [\#1540](https://github.com/apache/arrow-rs/issues/1540) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve Unit Test Coverage of ArrayReaderBuilder [\#1484](https://github.com/apache/arrow-rs/issues/1484) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- List equality method should work on empty offset `ListArray` [\#1817](https://github.com/apache/arrow-rs/issues/1817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Command line tool for convert CSV to Parquet [\#1797](https://github.com/apache/arrow-rs/issues/1797) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- IPC writer should write validity buffer for `UnionArray` in V4 IPC message [\#1793](https://github.com/apache/arrow-rs/issues/1793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add function for row alignment with page mask [\#1790](https://github.com/apache/arrow-rs/issues/1790) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rust IPC Read should be able to read V4 UnionType Array [\#1788](https://github.com/apache/arrow-rs/issues/1788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `combine_option_bitmap` should accept arbitrary number of input arrays. 
[\#1780](https://github.com/apache/arrow-rs/issues/1780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `substring_by_char` kernels for slicing on character boundaries [\#1768](https://github.com/apache/arrow-rs/issues/1768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support reading `PageIndex` from column metadata [\#1761](https://github.com/apache/arrow-rs/issues/1761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support casting from `DataType::Utf8` to `DataType::Boolean` [\#1740](https://github.com/apache/arrow-rs/issues/1740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make current position available in `FileWriter`. [\#1691](https://github.com/apache/arrow-rs/issues/1691) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support writing parquet to `stdout` [\#1687](https://github.com/apache/arrow-rs/issues/1687) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Parquet write failure \(from record batches\) when data is nested two levels deep [\#1744](https://github.com/apache/arrow-rs/issues/1744) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- IPC reader may break on projection [\#1735](https://github.com/apache/arrow-rs/issues/1735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Latest nightly fails to build with feature simd [\#1734](https://github.com/apache/arrow-rs/issues/1734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Trying to write parquet file in parallel results in corrupt file [\#1717](https://github.com/apache/arrow-rs/issues/1717) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Roundtrip failure when using DELTA\_BINARY\_PACKED [\#1708](https://github.com/apache/arrow-rs/issues/1708) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `ArrayData::try_new` cannot always return expected error. 
[\#1707](https://github.com/apache/arrow-rs/issues/1707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- "out of order projection is not supported" after Fix Parquet Arrow Schema Inference [\#1701](https://github.com/apache/arrow-rs/issues/1701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Rust is not interoperability with C++ for IPC schemas with dictionaries [\#1694](https://github.com/apache/arrow-rs/issues/1694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect Repeated Field Schema Inference [\#1681](https://github.com/apache/arrow-rs/issues/1681) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet Treats Embedded Arrow Schema as Authoritative [\#1663](https://github.com/apache/arrow-rs/issues/1663) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- parquet\_to\_arrow\_schema\_by\_columns Incorrectly Handles Nested Types [\#1654](https://github.com/apache/arrow-rs/issues/1654) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Inconsistent Arrow Schema When Projecting Nested Parquet File [\#1652](https://github.com/apache/arrow-rs/issues/1652) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- StructArrayReader Cannot Handle Nested Lists [\#1651](https://github.com/apache/arrow-rs/issues/1651) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Bug \(`substring` kernel\): The null buffer is not aligned when `offset != 0` [\#1639](https://github.com/apache/arrow-rs/issues/1639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect Offset Validation for Sliced List Array Children [\#1814](https://github.com/apache/arrow-rs/issues/1814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet Snappy Codec overwrites Existing Data in Decompression Buffer [\#1806](https://github.com/apache/arrow-rs/issues/1806) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- 
`flight_data_to_arrow_batch` does not support `RecordBatch`es with no columns [\#1783](https://github.com/apache/arrow-rs/issues/1783) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- parquet does not compile with `features=["zstd"]` [\#1630](https://github.com/apache/arrow-rs/issues/1630) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Documentation updates:** -- Parquet command line tool does not install "globally" [\#1710](https://github.com/apache/arrow-rs/issues/1710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Improve integration test document to follow Arrow C++ repo CI [\#1742](https://github.com/apache/arrow-rs/pull/1742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - -**Merged pull requests:** - -- Test for list array equality with different offsets [\#1756](https://github.com/apache/arrow-rs/pull/1756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Rename `string_concat` to `concat_elements_utf8` [\#1754](https://github.com/apache/arrow-rs/pull/1754) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Rename the `string` kernel to `concat_elements`. 
[\#1752](https://github.com/apache/arrow-rs/pull/1752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Support writing nested lists to parquet [\#1746](https://github.com/apache/arrow-rs/pull/1746) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Pin nightly version to bypass packed\_simd build error [\#1743](https://github.com/apache/arrow-rs/pull/1743) ([viirya](https://github.com/viirya)) -- Fix projection in IPC reader [\#1736](https://github.com/apache/arrow-rs/pull/1736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([iyupeng](https://github.com/iyupeng)) -- `cargo install` installs not globally [\#1732](https://github.com/apache/arrow-rs/pull/1732) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kazuk](https://github.com/kazuk)) -- Fix schema comparison for non\_canonical\_map when running flight test [\#1731](https://github.com/apache/arrow-rs/pull/1731) ([viirya](https://github.com/viirya)) -- Add `min_binary` and `max_binary` aggregate kernels [\#1725](https://github.com/apache/arrow-rs/pull/1725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix parquet benchmarks [\#1723](https://github.com/apache/arrow-rs/pull/1723) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix BitReader::get\_batch zero extension \(\#1708\) [\#1722](https://github.com/apache/arrow-rs/pull/1722) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Implementation string concat [\#1720](https://github.com/apache/arrow-rs/pull/1720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ismail-Maj](https://github.com/Ismail-Maj)) -- Check the length of `null_bit_buffer` in `ArrayData::try_new()` [\#1714](https://github.com/apache/arrow-rs/pull/1714) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix incorrect null\_count in `generate_unions_case` integration test [\#1713](https://github.com/apache/arrow-rs/pull/1713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix: Null buffer accounts for `offset` in `substring` kernel. [\#1704](https://github.com/apache/arrow-rs/pull/1704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Minor: Refine `OffsetSizeTrait` to extend `num::Integer` [\#1702](https://github.com/apache/arrow-rs/pull/1702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix StructArrayReader handling nested lists \(\#1651\) [\#1700](https://github.com/apache/arrow-rs/pull/1700) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Speed up the offsets checking [\#1684](https://github.com/apache/arrow-rs/pull/1684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) - -## [14.0.0](https://github.com/apache/arrow-rs/tree/14.0.0) (2022-05-13) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/13.0.0...14.0.0) - -**Breaking changes:** - -- Use `bytes` in parquet rather than custom Buffer implementation \(\#1474\) [\#1683](https://github.com/apache/arrow-rs/pull/1683) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Rename `OffsetSize::fn is_large` to `const OffsetSize::IS_LARGE` [\#1664](https://github.com/apache/arrow-rs/pull/1664) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove `StringOffsetTrait` and `BinaryOffsetTrait` 
[\#1645](https://github.com/apache/arrow-rs/pull/1645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix `generate_nested_dictionary_case` integration test failure [\#1636](https://github.com/apache/arrow-rs/pull/1636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) - -**Implemented enhancements:** - -- Add support for `DataType::Duration` in ffi interface [\#1688](https://github.com/apache/arrow-rs/issues/1688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix `generate_unions_case` integration test [\#1676](https://github.com/apache/arrow-rs/issues/1676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `DictionaryArray` support for `bit_length` kernel [\#1673](https://github.com/apache/arrow-rs/issues/1673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `DictionaryArray` support for `length` kernel [\#1672](https://github.com/apache/arrow-rs/issues/1672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- flight\_client\_scenarios integration test should receive schema from flight data [\#1669](https://github.com/apache/arrow-rs/issues/1669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Unpin Flatbuffer version dependency [\#1667](https://github.com/apache/arrow-rs/issues/1667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add dictionary array support for substring function [\#1656](https://github.com/apache/arrow-rs/issues/1656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Exclude dict\_id and dict\_is\_ordered from equality comparison of `Field` [\#1646](https://github.com/apache/arrow-rs/issues/1646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove `StringOffsetTrait` and `BinaryOffsetTrait` [\#1644](https://github.com/apache/arrow-rs/issues/1644) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add tests and examples for `UnionArray::from(data: ArrayData)` [\#1643](https://github.com/apache/arrow-rs/issues/1643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add methods `pub fn offsets_buffer`, `pub fn types_ids_buffer`and `pub fn data_buffer` for `ArrayDataBuilder` [\#1640](https://github.com/apache/arrow-rs/issues/1640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix `generate_nested_dictionary_case` integration test failure for Rust cases [\#1635](https://github.com/apache/arrow-rs/issues/1635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Expose `ArrowWriter` row group flush in public API [\#1626](https://github.com/apache/arrow-rs/issues/1626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add `substring` support for `FixedSizeBinaryArray` [\#1618](https://github.com/apache/arrow-rs/issues/1618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add PrettyPrint for `UnionArray`s [\#1594](https://github.com/apache/arrow-rs/issues/1594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add SIMD support for the `length` kernel [\#1489](https://github.com/apache/arrow-rs/issues/1489) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support dictionary arrays in length and bit\_length [\#1674](https://github.com/apache/arrow-rs/pull/1674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add dictionary array support for substring function [\#1665](https://github.com/apache/arrow-rs/pull/1665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sunchao](https://github.com/sunchao)) -- Add `DecimalType` support in `new_null_array ` [\#1659](https://github.com/apache/arrow-rs/pull/1659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) - -**Fixed bugs:** - -- Docs.rs build is broken 
[\#1695](https://github.com/apache/arrow-rs/issues/1695) -- Interoperability with C++ for IPC schemas with dictionaries [\#1694](https://github.com/apache/arrow-rs/issues/1694) -- `UnionArray::is_null` incorrect [\#1625](https://github.com/apache/arrow-rs/issues/1625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Published Parquet documentation missing `arrow::async_reader` [\#1617](https://github.com/apache/arrow-rs/issues/1617) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Files written with Julia's Arrow.jl in IPC format cannot be read by arrow-rs [\#1335](https://github.com/apache/arrow-rs/issues/1335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Documentation updates:** - -- Correct arrow-flight readme version [\#1641](https://github.com/apache/arrow-rs/pull/1641) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) - -**Closed issues:** - -- Make `OffsetSizeTrait::IS_LARGE` as a const value [\#1658](https://github.com/apache/arrow-rs/issues/1658) -- Question: Why are there 3 types of `OffsetSizeTrait`s? [\#1638](https://github.com/apache/arrow-rs/issues/1638) -- Written Parquet file way bigger than input files [\#1627](https://github.com/apache/arrow-rs/issues/1627) -- Ensure there is a single zero in the offsets buffer for an empty ListArray. 
[\#1620](https://github.com/apache/arrow-rs/issues/1620) -- Filtering `UnionArray` Changes DataType [\#1595](https://github.com/apache/arrow-rs/issues/1595) - -**Merged pull requests:** - -- Fix docs.rs build [\#1696](https://github.com/apache/arrow-rs/pull/1696) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- support duration in ffi [\#1689](https://github.com/apache/arrow-rs/pull/1689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ryan-jacobs1](https://github.com/ryan-jacobs1)) -- fix bench command line options [\#1685](https://github.com/apache/arrow-rs/pull/1685) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kazuk](https://github.com/kazuk)) -- Enable branch protection [\#1679](https://github.com/apache/arrow-rs/pull/1679) ([tustvold](https://github.com/tustvold)) -- Fix logical merge conflict in \#1588 [\#1678](https://github.com/apache/arrow-rs/pull/1678) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix generate\_unions\_case for Rust case [\#1677](https://github.com/apache/arrow-rs/pull/1677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Receive schema from flight data [\#1670](https://github.com/apache/arrow-rs/pull/1670) ([viirya](https://github.com/viirya)) -- unpin flatbuffers dependency version [\#1668](https://github.com/apache/arrow-rs/pull/1668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Cheappie](https://github.com/Cheappie)) -- Remove parquet dictionary converters \(\#1661\) [\#1662](https://github.com/apache/arrow-rs/pull/1662) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Minor: simplify the function `GenericListArray::get_type` [\#1650](https://github.com/apache/arrow-rs/pull/1650) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Pretty Print `UnionArray`s [\#1648](https://github.com/apache/arrow-rs/pull/1648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tfeda](https://github.com/tfeda)) -- Exclude `dict_id` and `dict_is_ordered` from equality comparison of `Field` [\#1647](https://github.com/apache/arrow-rs/pull/1647) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- expose row-group flush in public api [\#1634](https://github.com/apache/arrow-rs/pull/1634) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Cheappie](https://github.com/Cheappie)) -- Add `substring` support for `FixedSizeBinaryArray` [\#1633](https://github.com/apache/arrow-rs/pull/1633) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix UnionArray is\_null [\#1632](https://github.com/apache/arrow-rs/pull/1632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Do not assume dictionaries exists in footer [\#1631](https://github.com/apache/arrow-rs/pull/1631) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pcjentsch](https://github.com/pcjentsch)) -- Add support for nested list arrays from parquet to arrow arrays \(\#993\) [\#1588](https://github.com/apache/arrow-rs/pull/1588) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add `async` into doc features [\#1349](https://github.com/apache/arrow-rs/pull/1349) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([HaoYang670](https://github.com/HaoYang670)) - - -## [13.0.0](https://github.com/apache/arrow-rs/tree/13.0.0) (2022-04-29) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/12.0.0...13.0.0) - -**Breaking changes:** - -- Update `parquet::basic::LogicalType` to be more idomatic 
[\#1612](https://github.com/apache/arrow-rs/pull/1612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tfeda](https://github.com/tfeda)) -- Fix Null Mask Handling in `ArrayData`, `UnionArray`, and `MapArray` [\#1589](https://github.com/apache/arrow-rs/pull/1589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Replace `&Option` with `Option<&T>` in several `arrow` and `parquet` APIs [\#1571](https://github.com/apache/arrow-rs/pull/1571) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tfeda](https://github.com/tfeda)) - -**Implemented enhancements:** - -- Read/write nested dictionary under fixed size list in ipc stream reader/write [\#1609](https://github.com/apache/arrow-rs/issues/1609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for `BinaryArray` in `substring` kernel [\#1593](https://github.com/apache/arrow-rs/issues/1593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Read/write nested dictionary under large list in ipc stream reader/write [\#1584](https://github.com/apache/arrow-rs/issues/1584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Read/write nested dictionary under map in ipc stream reader/write [\#1582](https://github.com/apache/arrow-rs/issues/1582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement `Clone` for JSON `DecoderOptions` [\#1580](https://github.com/apache/arrow-rs/issues/1580) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add utf-8 validation checking to `substring` kernel [\#1575](https://github.com/apache/arrow-rs/issues/1575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting to/from `DataType::Null` in `cast` kernel [\#1572](https://github.com/apache/arrow-rs/pull/1572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([WinkerDu](https://github.com/WinkerDu)) - -**Fixed bugs:** - -- Parquet schema should allow scale == precision for decimal type [\#1606](https://github.com/apache/arrow-rs/issues/1606) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- ListArray::from\(ArrayData\) dereferences invalid pointer when offsets are empty [\#1601](https://github.com/apache/arrow-rs/issues/1601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ArrayData Equality Incorrect Null Mask Offset Handling [\#1599](https://github.com/apache/arrow-rs/issues/1599) -- Filtering UnionArray Incorrect Handles Runs [\#1598](https://github.com/apache/arrow-rs/issues/1598) -- \[Safety\] Filtering Dense UnionArray Produces Invalid Offsets [\#1596](https://github.com/apache/arrow-rs/issues/1596) -- \[Safety\] UnionBuilder Doesn't Check Types [\#1591](https://github.com/apache/arrow-rs/issues/1591) -- Union Layout Should Not Support Separate Validity Mask [\#1590](https://github.com/apache/arrow-rs/issues/1590) -- Incorrect nullable flag when reading maps \( test\_read\_maps fails when `force_validate` is active\) [\#1587](https://github.com/apache/arrow-rs/issues/1587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Output of `ipc::reader::tests::projection_should_work` fails validation [\#1548](https://github.com/apache/arrow-rs/issues/1548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect min/max statistics for decimals with byte-array notation [\#1532](https://github.com/apache/arrow-rs/issues/1532) - -**Documentation updates:** - -- Minor: Clarify docs on `UnionBuilder::append_null` [\#1628](https://github.com/apache/arrow-rs/pull/1628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Closed issues:** - -- Dense UnionArray Offsets Are i32 not i8 [\#1597](https://github.com/apache/arrow-rs/issues/1597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Replace `&Option` with 
`Option<&T>` in some APIs [\#1556](https://github.com/apache/arrow-rs/issues/1556) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve ergonomics of `parquet::basic::LogicalType` [\#1554](https://github.com/apache/arrow-rs/issues/1554) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Mark the current `substring` function as `unsafe` and rename it. [\#1541](https://github.com/apache/arrow-rs/issues/1541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Requirements for Async Parquet API [\#1473](https://github.com/apache/arrow-rs/issues/1473) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Merged pull requests:** - -- Nit: use the standard function `div_ceil` [\#1629](https://github.com/apache/arrow-rs/pull/1629) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Update flatbuffers requirement from =2.1.1 to =2.1.2 [\#1622](https://github.com/apache/arrow-rs/pull/1622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix decimals min max statistics [\#1621](https://github.com/apache/arrow-rs/pull/1621) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([atefsawaed](https://github.com/atefsawaed)) -- Add example readme [\#1615](https://github.com/apache/arrow-rs/pull/1615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve docs and examples links on main readme [\#1614](https://github.com/apache/arrow-rs/pull/1614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Read/Write nested dictionaries under FixedSizeList in IPC [\#1610](https://github.com/apache/arrow-rs/pull/1610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add `substring` support for 
binary [\#1608](https://github.com/apache/arrow-rs/pull/1608) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Parquet: schema validation should allow scale == precision for decimal type [\#1607](https://github.com/apache/arrow-rs/pull/1607) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sunchao](https://github.com/sunchao)) -- Don't access and validate offset buffer in ListArray::from\(ArrayData\) [\#1602](https://github.com/apache/arrow-rs/pull/1602) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Fix map nullable flag in `ParquetTypeConverter` [\#1592](https://github.com/apache/arrow-rs/pull/1592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Read/write nested dictionary under large list in ipc stream reader/writer [\#1585](https://github.com/apache/arrow-rs/pull/1585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Read/write nested dictionary under map in ipc stream reader/writer [\#1583](https://github.com/apache/arrow-rs/pull/1583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Derive `Clone` and `PartialEq` for json `DecoderOptions` [\#1581](https://github.com/apache/arrow-rs/pull/1581) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add utf-8 validation checking for `substring` [\#1577](https://github.com/apache/arrow-rs/pull/1577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Use `Option` rather than `Option<&T>` for copy types in substring kernel [\#1576](https://github.com/apache/arrow-rs/pull/1576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use littleendian arrow files for 
`projection_should_work` [\#1573](https://github.com/apache/arrow-rs/pull/1573) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - - -## [12.0.0](https://github.com/apache/arrow-rs/tree/12.0.0) (2022-04-15) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/11.1.0...12.0.0) - -**Breaking changes:** - -- Add `ArrowReaderOptions` to `ParquetFileArrowReader`, add option to skip decoding arrow metadata from parquet \(\#1459\) [\#1558](https://github.com/apache/arrow-rs/pull/1558) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support `RecordBatch` with zero columns but non zero row count, add field to `RecordBatchOptions` \(\#1536\) [\#1552](https://github.com/apache/arrow-rs/pull/1552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Consolidate JSON Reader options and `DecoderOptions` [\#1539](https://github.com/apache/arrow-rs/pull/1539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Update `prost`, `prost-derive` and `prost-types` to 0.10, `tonic`, and `tonic-build` to `0.7` [\#1510](https://github.com/apache/arrow-rs/pull/1510) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Add Json `DecoderOptions` and support custom `format_string` for each field [\#1451](https://github.com/apache/arrow-rs/pull/1451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sum12](https://github.com/sum12)) - -**Implemented enhancements:** - -- Read/write nested dictionary in ipc stream reader/writer [\#1565](https://github.com/apache/arrow-rs/issues/1565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `FixedSizeBinary` in the Arrow C data interface [\#1553](https://github.com/apache/arrow-rs/issues/1553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- 
Support Empty Column Projection in `ParquetRecordBatchReader` [\#1537](https://github.com/apache/arrow-rs/issues/1537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support `RecordBatch` with zero columns but non zero row count [\#1536](https://github.com/apache/arrow-rs/issues/1536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for `Date32`/`Date64`\<--\> `String`/`LargeString` in `cast` kernel [\#1535](https://github.com/apache/arrow-rs/issues/1535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support creating arrays from externally owned memory like `Vec` or `String` [\#1516](https://github.com/apache/arrow-rs/issues/1516) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speed up the `substring` kernel [\#1511](https://github.com/apache/arrow-rs/issues/1511) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Handle Parquet Files With Inconsistent Timestamp Units [\#1459](https://github.com/apache/arrow-rs/issues/1459) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Fixed bugs:** - -- Error Infering Schema for LogicalType::UNKNOWN [\#1557](https://github.com/apache/arrow-rs/issues/1557) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Read dictionary from nested struct in ipc stream reader panics [\#1549](https://github.com/apache/arrow-rs/issues/1549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `filter` produces invalid sparse `UnionArray`s [\#1547](https://github.com/apache/arrow-rs/issues/1547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Documentation for `GenericListBuilder` is not exposed. 
[\#1518](https://github.com/apache/arrow-rs/issues/1518) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- cannot read parquet file [\#1515](https://github.com/apache/arrow-rs/issues/1515) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- The `substring` kernel panics when chars \> U+0x007F [\#1478](https://github.com/apache/arrow-rs/issues/1478) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Hang due to infinite loop when reading some parquet files with RLE encoding and bit packing [\#1458](https://github.com/apache/arrow-rs/issues/1458) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Documentation updates:** - -- Improve JSON reader documentation [\#1559](https://github.com/apache/arrow-rs/pull/1559) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve doc string for `substring` kernel [\#1529](https://github.com/apache/arrow-rs/pull/1529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Expose documentation of `GenericListBuilder` [\#1525](https://github.com/apache/arrow-rs/pull/1525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comath](https://github.com/comath)) -- Add a diagram to `take` kernel documentation [\#1524](https://github.com/apache/arrow-rs/pull/1524) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Closed issues:** - -- Interesting benchmark results of `min_max_helper` [\#1400](https://github.com/apache/arrow-rs/issues/1400) - -**Merged pull requests:** - -- Fix incorrect `into_buffers` for UnionArray [\#1567](https://github.com/apache/arrow-rs/pull/1567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Read/write nested dictionary in ipc stream reader/writer [\#1566](https://github.com/apache/arrow-rs/pull/1566) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support FixedSizeBinary and FixedSizeList for the C data interface [\#1564](https://github.com/apache/arrow-rs/pull/1564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sunchao](https://github.com/sunchao)) -- Split out ListArrayReader into separate module \(\#1483\) [\#1563](https://github.com/apache/arrow-rs/pull/1563) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Split out `MapArray` into separate module \(\#1483\) [\#1562](https://github.com/apache/arrow-rs/pull/1562) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support empty projection in `ParquetRecordBatchReader` [\#1560](https://github.com/apache/arrow-rs/pull/1560) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- fix infinite loop in not fully packed bit-packed runs [\#1555](https://github.com/apache/arrow-rs/pull/1555) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add test for creating FixedSizeBinaryArray::try\_from\_sparse\_iter failed when given all Nones [\#1551](https://github.com/apache/arrow-rs/pull/1551) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Fix reading dictionaries from nested structs in ipc `StreamReader` [\#1550](https://github.com/apache/arrow-rs/pull/1550) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dispanser](https://github.com/dispanser)) -- Add support for Date32/64 \<--\> String/LargeString in `cast` kernel [\#1534](https://github.com/apache/arrow-rs/pull/1534) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) -- fix clippy errors in 1.60 [\#1527](https://github.com/apache/arrow-rs/pull/1527) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Mark `remove-old-releases.sh` executable [\#1522](https://github.com/apache/arrow-rs/pull/1522) ([alamb](https://github.com/alamb)) -- Delete duplicate code in the `sort` kernel [\#1519](https://github.com/apache/arrow-rs/pull/1519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix reading nested lists from parquet files [\#1517](https://github.com/apache/arrow-rs/pull/1517) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Speed up the `substring` kernel by about 2x [\#1512](https://github.com/apache/arrow-rs/pull/1512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Add `new_from_strings` to create `MapArrays` [\#1507](https://github.com/apache/arrow-rs/pull/1507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Decouple buffer deallocation from ffi and allow creating buffers from rust vec [\#1494](https://github.com/apache/arrow-rs/pull/1494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) - -## [11.1.0](https://github.com/apache/arrow-rs/tree/11.1.0) (2022-03-31) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/11.0.0...11.1.0) - -**Implemented enhancements:** - -- Implement `size_hint` and `ExactSizedIterator` for DecimalArray [\#1505](https://github.com/apache/arrow-rs/issues/1505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support calculate length by chars for `StringArray` [\#1493](https://github.com/apache/arrow-rs/issues/1493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `length` kernel support for `ListArray` [\#1470](https://github.com/apache/arrow-rs/issues/1470) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- The length kernel should work with `BinaryArray`s [\#1464](https://github.com/apache/arrow-rs/issues/1464) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- FFI for Arrow C Stream Interface [\#1348](https://github.com/apache/arrow-rs/issues/1348) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve performance of `DictionaryArray::try_new()` [\#1313](https://github.com/apache/arrow-rs/issues/1313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Fixed bugs:** - -- MIRI error in math\_checked\_divide\_op/try\_from\_trusted\_len\_iter [\#1496](https://github.com/apache/arrow-rs/issues/1496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet Writer Incorrect Definition Levels for Nested NullArray [\#1480](https://github.com/apache/arrow-rs/issues/1480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- FFI: ArrowArray::try\_from\_raw shouldn't clone [\#1425](https://github.com/apache/arrow-rs/issues/1425) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet reader fails to read null list. [\#1399](https://github.com/apache/arrow-rs/issues/1399) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Documentation updates:** - -- A small mistake in the doc of `BinaryArray` and `LargeBinaryArray` [\#1455](https://github.com/apache/arrow-rs/issues/1455) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- A small mistake in the doc of `GenericBinaryArray::take_iter_unchecked` [\#1454](https://github.com/apache/arrow-rs/issues/1454) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add links in the doc of `BinaryOffsetSizeTrait` [\#1453](https://github.com/apache/arrow-rs/issues/1453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- The doc of `FixedSizeBinaryArray` is confusing. 
[\#1452](https://github.com/apache/arrow-rs/issues/1452) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Clarify docs that SlicesIterator ignores null values [\#1504](https://github.com/apache/arrow-rs/pull/1504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Update the doc of `BinaryArray` and `LargeBinaryArray` [\#1471](https://github.com/apache/arrow-rs/pull/1471) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) - -**Closed issues:** - -- `packed_simd` v.s. `portable_simd`, which should be used? [\#1492](https://github.com/apache/arrow-rs/issues/1492) -- Cleanup: Use Arrow take kernel Within parquet ListArrayReader [\#1482](https://github.com/apache/arrow-rs/issues/1482) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Merged pull requests:** - -- Implement `size_hint` and `ExactSizedIterator` for `DecimalArray` [\#1506](https://github.com/apache/arrow-rs/pull/1506) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add `StringArray::num_chars` for calculating number of characters [\#1503](https://github.com/apache/arrow-rs/pull/1503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Workaround nightly miri error in `try_from_trusted_len_iter` [\#1497](https://github.com/apache/arrow-rs/pull/1497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- update doc of array\_binary and array\_string [\#1491](https://github.com/apache/arrow-rs/pull/1491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Use Arrow take kernel within ListArrayReader [\#1490](https://github.com/apache/arrow-rs/pull/1490) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Add `length` 
kernel support for List Array [\#1488](https://github.com/apache/arrow-rs/pull/1488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Support sort for `Decimal` data type [\#1487](https://github.com/apache/arrow-rs/pull/1487) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) -- Fix reading/writing nested null arrays \(\#1480\) \(\#1036\) \(\#1399\) [\#1481](https://github.com/apache/arrow-rs/pull/1481) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Implement ArrayEqual for UnionArray [\#1469](https://github.com/apache/arrow-rs/pull/1469) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support the `length` kernel on Binary Array [\#1465](https://github.com/apache/arrow-rs/pull/1465) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove Clone and copy source structs internally [\#1449](https://github.com/apache/arrow-rs/pull/1449) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix Parquet reader for null lists [\#1448](https://github.com/apache/arrow-rs/pull/1448) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Improve performance of DictionaryArray::try\_new\(\)  [\#1435](https://github.com/apache/arrow-rs/pull/1435) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Add FFI for Arrow C Stream Interface [\#1384](https://github.com/apache/arrow-rs/pull/1384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - -## [11.0.0](https://github.com/apache/arrow-rs/tree/11.0.0) (2022-03-17) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/10.0.0...11.0.0) - -**Breaking changes:** 
- -- Replace `filter_row_groups` with `ReadOptions` in parquet SerializedFileReader [\#1389](https://github.com/apache/arrow-rs/pull/1389) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([yjshen](https://github.com/yjshen)) -- Implement projection for arrow `IPC Reader` file / streams [\#1339](https://github.com/apache/arrow-rs/pull/1339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Dandandan](https://github.com/Dandandan)) - -**Implemented enhancements:** - -- Fix generate\_interval\_case integration test failure [\#1445](https://github.com/apache/arrow-rs/issues/1445) -- Make the doc examples of `ListArray` and `LargeListArray` more readable [\#1433](https://github.com/apache/arrow-rs/issues/1433) -- Redundant `if` and `abs` in `shift()` [\#1427](https://github.com/apache/arrow-rs/issues/1427) -- Improve substring kernel performance [\#1422](https://github.com/apache/arrow-rs/issues/1422) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add missing value\_unchecked\(\) of `FixedSizeBinaryArray` [\#1419](https://github.com/apache/arrow-rs/issues/1419) -- Remove duplicate bound check in function `shift` [\#1408](https://github.com/apache/arrow-rs/issues/1408) -- Support dictionary array in C data interface [\#1397](https://github.com/apache/arrow-rs/issues/1397) -- filter kernel should work with `UnionArray`s [\#1394](https://github.com/apache/arrow-rs/issues/1394) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- filter kernel should work with `FixedSizeListArrays`s [\#1393](https://github.com/apache/arrow-rs/issues/1393) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add doc examples for creating FixedSizeListArray [\#1392](https://github.com/apache/arrow-rs/issues/1392) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Update `rust-version` to 1.59 [\#1377](https://github.com/apache/arrow-rs/issues/1377) -- 
Arrow IPC projection support [\#1338](https://github.com/apache/arrow-rs/issues/1338) -- Implement basic FlightSQL Server [\#1386](https://github.com/apache/arrow-rs/pull/1386) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([wangfenjin](https://github.com/wangfenjin)) - -**Fixed bugs:** - -- DictionaryArray::try\_new ignores validity bitmap of the keys [\#1429](https://github.com/apache/arrow-rs/issues/1429) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- The doc of `GenericListArray` is confusing [\#1424](https://github.com/apache/arrow-rs/issues/1424) -- DeltaBitPackDecoder Incorrectly Handles Non-Zero MiniBlock Bit Width Padding [\#1417](https://github.com/apache/arrow-rs/issues/1417) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- DeltaBitPackEncoder Pads Miniblock BitWidths With Arbitrary Values [\#1416](https://github.com/apache/arrow-rs/issues/1416) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Possible unaligned write with MutableBuffer::push [\#1410](https://github.com/apache/arrow-rs/issues/1410) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Integration Test is failing on master branch [\#1398](https://github.com/apache/arrow-rs/issues/1398) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Documentation updates:** - -- Rewrite doc of `GenericListArray` [\#1450](https://github.com/apache/arrow-rs/pull/1450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix integration doc about build.ninja location [\#1438](https://github.com/apache/arrow-rs/pull/1438) ([viirya](https://github.com/viirya)) - -**Merged pull requests:** - -- Rewrite doc example of `ListArray` and `LargeListArray` [\#1447](https://github.com/apache/arrow-rs/pull/1447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix generate\_interval\_case in integration test 
[\#1446](https://github.com/apache/arrow-rs/pull/1446) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix generate\_decimal128\_case in integration test [\#1440](https://github.com/apache/arrow-rs/pull/1440) ([viirya](https://github.com/viirya)) -- `filter` kernel should work with FixedSizeListArrays [\#1434](https://github.com/apache/arrow-rs/pull/1434) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support nullable keys in DictionaryArray::try\_new [\#1430](https://github.com/apache/arrow-rs/pull/1430) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- remove redundant if/clamp\_min/abs [\#1428](https://github.com/apache/arrow-rs/pull/1428) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Add doc example for creating `FixedSizeListArray` [\#1426](https://github.com/apache/arrow-rs/pull/1426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Directly write to MutableBuffer in substring [\#1423](https://github.com/apache/arrow-rs/pull/1423) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix possibly unaligned writes in MutableBuffer [\#1421](https://github.com/apache/arrow-rs/pull/1421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Add value\_unchecked\(\) and unit test [\#1420](https://github.com/apache/arrow-rs/pull/1420) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Fix DeltaBitPack MiniBlock Bit Width Padding [\#1418](https://github.com/apache/arrow-rs/pull/1418) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update zstd requirement from 0.10 to 0.11 
[\#1415](https://github.com/apache/arrow-rs/pull/1415) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Set `default-features = false` for `zstd` in the parquet crate to support `wasm32-unknown-unknown` [\#1414](https://github.com/apache/arrow-rs/pull/1414) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kylebarron](https://github.com/kylebarron)) -- Add support for `UnionArray` in `filter` kernel [\#1412](https://github.com/apache/arrow-rs/pull/1412) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Remove duplicate bound check in the function `shift` [\#1409](https://github.com/apache/arrow-rs/pull/1409) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Add dictionary support for C data interface [\#1407](https://github.com/apache/arrow-rs/pull/1407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sunchao](https://github.com/sunchao)) -- Fix a small spelling mistake in docs. 
[\#1406](https://github.com/apache/arrow-rs/pull/1406) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Add unit test to check `FixedSizeBinaryArray` input all none [\#1405](https://github.com/apache/arrow-rs/pull/1405) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Move csv Parser trait and its implementations to utils module [\#1385](https://github.com/apache/arrow-rs/pull/1385) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sum12](https://github.com/sum12)) - -## [10.0.0](https://github.com/apache/arrow-rs/tree/10.0.0) (2022-03-04) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/9.1.0...10.0.0) - -**Breaking changes:** - -- Remove existing has\_ methods for optional fields in `ColumnChunkMetaData` [\#1346](https://github.com/apache/arrow-rs/pull/1346) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) -- Remove redundant `has_` methods in `ColumnChunkMetaData` [\#1345](https://github.com/apache/arrow-rs/pull/1345) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) - -**Implemented enhancements:** - -- Add extract month and day in temporal.rs [\#1387](https://github.com/apache/arrow-rs/issues/1387) -- Add clone to `IpcWriteOptions` [\#1381](https://github.com/apache/arrow-rs/issues/1381) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `MapArray` in `filter` kernel [\#1378](https://github.com/apache/arrow-rs/issues/1378) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `week` temporal kernel [\#1375](https://github.com/apache/arrow-rs/issues/1375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve performance of `compare_dict_op` [\#1371](https://github.com/apache/arrow-rs/issues/1371) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for LargeUtf8 in json writer [\#1357](https://github.com/apache/arrow-rs/issues/1357) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Make `arrow::array::builder::MapBuilder` public [\#1354](https://github.com/apache/arrow-rs/issues/1354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Refactor `StructArray::from` [\#1351](https://github.com/apache/arrow-rs/issues/1351) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Refactor `RecordBatch::validate_new_batch` [\#1350](https://github.com/apache/arrow-rs/issues/1350) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove redundant has\_ methods for optional column metadata fields [\#1344](https://github.com/apache/arrow-rs/issues/1344) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add `write` method to JsonWriter [\#1340](https://github.com/apache/arrow-rs/issues/1340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Refactor the code of `Bitmap::new` [\#1337](https://github.com/apache/arrow-rs/issues/1337) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use DictionaryArray's iterator in `compare_dict_op` [\#1329](https://github.com/apache/arrow-rs/issues/1329) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `as_decimal_array(arr: &dyn Array) -> &DecimalArray` [\#1312](https://github.com/apache/arrow-rs/issues/1312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- More ergonomic / idiomatic primitive array creation from iterators [\#1298](https://github.com/apache/arrow-rs/issues/1298) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement DictionaryArray support in `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#1201](https://github.com/apache/arrow-rs/issues/1201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Fixed bugs:** - -- `cargo clippy` fails on 
the `master` branch [\#1362](https://github.com/apache/arrow-rs/issues/1362) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `ArrowArray::try_from_raw` should not assume the pointers are from Arc [\#1333](https://github.com/apache/arrow-rs/issues/1333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix CSV Writer::new to accept delimiter and make WriterBuilder::build use it [\#1328](https://github.com/apache/arrow-rs/issues/1328) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make bounds configurable via builder when reading CSV [\#1327](https://github.com/apache/arrow-rs/issues/1327) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `with_datetime_format()` to CSV WriterBuilder [\#1272](https://github.com/apache/arrow-rs/issues/1272) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Performance improvements:** - -- Improve performance of `min` and `max` aggregation kernels without nulls [\#1373](https://github.com/apache/arrow-rs/issues/1373) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Closed issues:** - -- Consider removing redundant has\_XXX metadata functions in `ColumnChunkMetadata` [\#1332](https://github.com/apache/arrow-rs/issues/1332) - -**Merged pull requests:** - -- Support extract `day` and `month` in temporal.rs [\#1388](https://github.com/apache/arrow-rs/pull/1388) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Add write method to Json Writer [\#1383](https://github.com/apache/arrow-rs/pull/1383) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) -- Derive `Clone` for `IpcWriteOptions` [\#1382](https://github.com/apache/arrow-rs/pull/1382) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) -- feat: support maps in MutableArrayData 
[\#1379](https://github.com/apache/arrow-rs/pull/1379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) -- Support extract `week` in temporal.rs [\#1376](https://github.com/apache/arrow-rs/pull/1376) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Speed up the function `min_max_string` [\#1374](https://github.com/apache/arrow-rs/pull/1374) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Improve performance of dictionary kernels, add benchmark and add `take_iter_unchecked` [\#1372](https://github.com/apache/arrow-rs/pull/1372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update pyo3 requirement from 0.15 to 0.16 [\#1369](https://github.com/apache/arrow-rs/pull/1369) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update contributing guide [\#1368](https://github.com/apache/arrow-rs/pull/1368) ([HaoYang670](https://github.com/HaoYang670)) -- Allow primitive array creation from iterators of PrimitiveTypes \(as well as `Option`\) [\#1367](https://github.com/apache/arrow-rs/pull/1367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update flatbuffers requirement from =2.1.0 to =2.1.1 [\#1364](https://github.com/apache/arrow-rs/pull/1364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix clippy lints [\#1363](https://github.com/apache/arrow-rs/pull/1363) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Refactor `RecordBatch::validate_new_batch` [\#1361](https://github.com/apache/arrow-rs/pull/1361) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Refactor `StructArray::from` [\#1360](https://github.com/apache/arrow-rs/pull/1360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Update flatbuffers requirement from =2.0.0 to =2.1.0 [\#1359](https://github.com/apache/arrow-rs/pull/1359) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- fix: add LargeUtf8 support in json writer [\#1358](https://github.com/apache/arrow-rs/pull/1358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tiphaineruy](https://github.com/tiphaineruy)) -- Add `as_decimal_array` function [\#1356](https://github.com/apache/arrow-rs/pull/1356) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Publicly export arrow::array::MapBuilder [\#1355](https://github.com/apache/arrow-rs/pull/1355) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tjwilson90](https://github.com/tjwilson90)) -- Add with\_datetime\_format to csv WriterBuilder [\#1347](https://github.com/apache/arrow-rs/pull/1347) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- Refactor `Bitmap::new` [\#1343](https://github.com/apache/arrow-rs/pull/1343) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove delimiter from csv Writer [\#1342](https://github.com/apache/arrow-rs/pull/1342) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- Make bounds configurable in csv ReaderBuilder [\#1341](https://github.com/apache/arrow-rs/pull/1341) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- `ArrowArray::try_from_raw` should not assume the pointers are from Arc 
[\#1334](https://github.com/apache/arrow-rs/pull/1334) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Use DictionaryArray's iterator in `compare_dict_op` [\#1330](https://github.com/apache/arrow-rs/pull/1330) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Implement DictionaryArray support in neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#1326](https://github.com/apache/arrow-rs/pull/1326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Arrow Rust + Conbench Integration [\#1289](https://github.com/apache/arrow-rs/pull/1289) ([dianaclarke](https://github.com/dianaclarke)) - -## [9.1.0](https://github.com/apache/arrow-rs/tree/9.1.0) (2022-02-19) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/9.0.2...9.1.0) - -**Implemented enhancements:** - -- Exposing page encoding stats [\#1321](https://github.com/apache/arrow-rs/issues/1321) -- Improve filter performance by special casing high and low selectivity predicates [\#1288](https://github.com/apache/arrow-rs/issues/1288) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speed up `DeltaBitPackDecoder` [\#1281](https://github.com/apache/arrow-rs/issues/1281) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Fix all clippy lints in arrow crate [\#1255](https://github.com/apache/arrow-rs/issues/1255) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Expose page encoding `ColumnChunkMetadata` [\#1322](https://github.com/apache/arrow-rs/pull/1322) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) -- Expose column index and offset index in `ColumnChunkMetadata` [\#1318](https://github.com/apache/arrow-rs/pull/1318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) -- Expose bloom filter 
offset in `ColumnChunkMetadata` [\#1309](https://github.com/apache/arrow-rs/pull/1309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) -- Add `DictionaryArray::try_new()` to create dictionaries from pre existing arrays [\#1300](https://github.com/apache/arrow-rs/pull/1300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add `DictionaryArray::keys_iter`, and `take_iter` for other array types [\#1296](https://github.com/apache/arrow-rs/pull/1296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Make `rle` decoder public under `experimental` feature [\#1271](https://github.com/apache/arrow-rs/pull/1271) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Add `DictionaryArray` support in `eq_dyn` kernel [\#1263](https://github.com/apache/arrow-rs/pull/1263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - -**Fixed bugs:** - -- `len` is not a parameter of `MutableArrayData::extend` [\#1316](https://github.com/apache/arrow-rs/issues/1316) -- module `data_type` is private in Rust Parquet 8.0.0 [\#1302](https://github.com/apache/arrow-rs/issues/1302) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Test failure: bit\_chunk\_iterator [\#1294](https://github.com/apache/arrow-rs/issues/1294) -- csv\_writer benchmark fails with "no such file or directory" [\#1292](https://github.com/apache/arrow-rs/issues/1292) - -**Documentation updates:** - -- Fix warnings in `cargo doc` [\#1268](https://github.com/apache/arrow-rs/pull/1268) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Performance improvements:** - -- Vectorize DeltaBitPackDecoder, up to 5x faster decoding 
[\#1284](https://github.com/apache/arrow-rs/pull/1284) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Skip zero-ing primitive nulls [\#1280](https://github.com/apache/arrow-rs/pull/1280) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add specialized filter kernels in `compute` module \(up to 10x faster\) [\#1248](https://github.com/apache/arrow-rs/pull/1248) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - -**Closed issues:** - -- Expose column and offset index metadata offset [\#1317](https://github.com/apache/arrow-rs/issues/1317) -- Expose bloom filter metadata offset [\#1308](https://github.com/apache/arrow-rs/issues/1308) -- Improve ergonomics to construct `DictionaryArrays` from `Key` and `Value` arrays [\#1299](https://github.com/apache/arrow-rs/issues/1299) -- Make it easier to iterate over `DictionaryArray` [\#1295](https://github.com/apache/arrow-rs/issues/1295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- (WON'T FIX) Don't Intertwine Bit and Byte Aligned Operations in `BitReader` [\#1282](https://github.com/apache/arrow-rs/issues/1282) -- how to create arrow::array from streamReader [\#1278](https://github.com/apache/arrow-rs/issues/1278) -- Remove scientific notation when converting floats to strings. 
[\#983](https://github.com/apache/arrow-rs/issues/983) - -**Merged pull requests:** - -- Update the document of function `MutableArrayData::extend` [\#1336](https://github.com/apache/arrow-rs/pull/1336) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix clippy lint `dead_code` [\#1324](https://github.com/apache/arrow-rs/pull/1324) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- fix test bug and ensure that bloom filter metadata is serialized in `to_thrift` [\#1320](https://github.com/apache/arrow-rs/pull/1320) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) -- Enable more clippy lints in arrow [\#1315](https://github.com/apache/arrow-rs/pull/1315) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- Fix clippy lint `clippy::type_complexity` [\#1310](https://github.com/apache/arrow-rs/pull/1310) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- Fix clippy lint `clippy::float_equality_without_abs` [\#1305](https://github.com/apache/arrow-rs/pull/1305) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- Fix clippy `clippy::vec_init_then_push` lint [\#1303](https://github.com/apache/arrow-rs/pull/1303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) -- Fix failing csv\_writer bench [\#1293](https://github.com/apache/arrow-rs/pull/1293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) -- Changes for 9.0.2 [\#1291](https://github.com/apache/arrow-rs/pull/1291) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Fix bitmask creation also for simd comparisons with scalar [\#1290](https://github.com/apache/arrow-rs/pull/1290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Fix simd comparison kernels [\#1286](https://github.com/apache/arrow-rs/pull/1286) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Restrict Decoder to compatible types \(\#1276\) [\#1277](https://github.com/apache/arrow-rs/pull/1277) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix some clippy lints in parquet crate, rename `LevelEncoder` variants to conform to Rust standards [\#1273](https://github.com/apache/arrow-rs/pull/1273) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([HaoYang670](https://github.com/HaoYang670)) -- Use new DecimalArray creation API in arrow crate [\#1249](https://github.com/apache/arrow-rs/pull/1249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve `DecimalArray` API ergonomics: add `iter()`, `FromIterator`, `with_precision_and_scale` [\#1223](https://github.com/apache/arrow-rs/pull/1223) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - - -## [9.0.2](https://github.com/apache/arrow-rs/tree/9.0.2) (2022-02-09) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/8.0.0...9.0.2) - -**Breaking changes:** - -- Add `Send` + `Sync` to `DataType`, `RowGroupReader`, `FileReader`, `ChunkReader`. 
[\#1264](https://github.com/apache/arrow-rs/issues/1264) -- Rename the function `Bitmap::len` to `Bitmap::bit_len` to clarify its meaning [\#1242](https://github.com/apache/arrow-rs/pull/1242) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove unused / broken `memory-check` feature [\#1222](https://github.com/apache/arrow-rs/pull/1222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Potentially buffer multiple `RecordBatches` before writing a parquet row group in `ArrowWriter` [\#1214](https://github.com/apache/arrow-rs/pull/1214) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - -**Implemented enhancements:** - -- Add `async` arrow parquet reader [\#1154](https://github.com/apache/arrow-rs/pull/1154) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Rename `Bitmap::len` to `Bitmap::bit_len` [\#1233](https://github.com/apache/arrow-rs/issues/1233) -- Extend CSV schema inference to allow scientific notation for floating point types [\#1215](https://github.com/apache/arrow-rs/issues/1215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Write Multiple RecordBatch to Parquet Row Group [\#1211](https://github.com/apache/arrow-rs/issues/1211) -- Add doc examples for `eq_dyn` etc. 
[\#1202](https://github.com/apache/arrow-rs/issues/1202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add comparison kernels for `BinaryArray` [\#1108](https://github.com/apache/arrow-rs/issues/1108) -- `impl ArrowNativeType for i128` [\#1098](https://github.com/apache/arrow-rs/issues/1098) -- Remove `Copy` trait bound from dyn scalar kernels [\#1243](https://github.com/apache/arrow-rs/pull/1243) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) -- Add `into_inner` for IPC `FileWriter` [\#1236](https://github.com/apache/arrow-rs/pull/1236) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) -- \[Minor\]Re-export `array::builder::make_builder` to make it available for downstream [\#1235](https://github.com/apache/arrow-rs/pull/1235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) - -**Fixed bugs:** - -- Parquet v8.0.0 panics when reading all null column to NullArray [\#1245](https://github.com/apache/arrow-rs/issues/1245) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Get `Unknown configuration option rust-version` when running the rust format command [\#1240](https://github.com/apache/arrow-rs/issues/1240) -- `Bitmap` Length Validation is Incorrect [\#1231](https://github.com/apache/arrow-rs/issues/1231) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Writing sliced `ListArray` or `MapArray` ignore offsets [\#1226](https://github.com/apache/arrow-rs/issues/1226) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Remove broken `memory-tracking` crate feature [\#1171](https://github.com/apache/arrow-rs/issues/1171) -- Revert making `parquet::data_type` and `parquet::arrow::schema` experimental [\#1244](https://github.com/apache/arrow-rs/pull/1244) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([tustvold](https://github.com/tustvold)) - -**Documentation updates:** - -- Update parquet crate documentation and examples [\#1253](https://github.com/apache/arrow-rs/pull/1253) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Refresh parquet readme / contributing guide [\#1252](https://github.com/apache/arrow-rs/pull/1252) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Add docs examples for dynamically compare functions [\#1250](https://github.com/apache/arrow-rs/pull/1250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Add Rust Docs examples for UnionArray [\#1241](https://github.com/apache/arrow-rs/pull/1241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Improve documentation for Bitmap [\#1237](https://github.com/apache/arrow-rs/pull/1237) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Performance improvements:** - -- Improve performance for arithmetic kernels with `simd` feature enabled \(except for division/modulo\) [\#1221](https://github.com/apache/arrow-rs/pull/1221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Do not concatenate identical dictionaries [\#1219](https://github.com/apache/arrow-rs/pull/1219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Preserve dictionary encoding when decoding parquet into Arrow arrays, 60x perf improvement \(\#171\) [\#1180](https://github.com/apache/arrow-rs/pull/1180) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - -**Closed issues:** - -- `UnalignedBitChunkIterator` to that iterates through already 
aligned `u64` blocks [\#1227](https://github.com/apache/arrow-rs/issues/1227) -- Remove unused `ArrowArrayReader` in parquet [\#1197](https://github.com/apache/arrow-rs/issues/1197) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Merged pull requests:** - -- Upgrade clap to 3.0.0 [\#1261](https://github.com/apache/arrow-rs/pull/1261) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) -- Update chrono-tz requirement from 0.4 to 0.6 [\#1259](https://github.com/apache/arrow-rs/pull/1259) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update zstd requirement from 0.9 to 0.10 [\#1257](https://github.com/apache/arrow-rs/pull/1257) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix NullArrayReader \(\#1245\) [\#1246](https://github.com/apache/arrow-rs/pull/1246) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- dyn compare for binary array [\#1238](https://github.com/apache/arrow-rs/pull/1238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove arrow array reader \(\#1197\) [\#1234](https://github.com/apache/arrow-rs/pull/1234) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix null bitmap length validation \(\#1231\) [\#1232](https://github.com/apache/arrow-rs/pull/1232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster bitmask iteration [\#1228](https://github.com/apache/arrow-rs/pull/1228) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add non utf8 values into the test cases of BinaryArray 
comparison [\#1220](https://github.com/apache/arrow-rs/pull/1220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Update DECIMAL\_RE to allow scientific notation in auto inferred schemas [\#1216](https://github.com/apache/arrow-rs/pull/1216) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pjmore](https://github.com/pjmore)) -- Fix simd comparison kernels [\#1286](https://github.com/apache/arrow-rs/pull/1286) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Fix bitmask creation also for simd comparisons with scalar [\#1290](https://github.com/apache/arrow-rs/pull/1290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) - -## [8.0.0](https://github.com/apache/arrow-rs/tree/8.0.0) (2022-01-20) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/7.0.0...8.0.0) - -**Breaking changes:** - -- Return error from JSON writer rather than panic [\#1205](https://github.com/apache/arrow-rs/pull/1205) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Remove `ArrowSignedNumericType ` to Simplify and reduce code duplication in arithmetic kernels [\#1161](https://github.com/apache/arrow-rs/pull/1161) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Restrict RecordReader and friends to scalar types \(\#1132\) [\#1155](https://github.com/apache/arrow-rs/pull/1155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Move more parquet functionality behind experimental feature flag \(\#1032\) [\#1134](https://github.com/apache/arrow-rs/pull/1134) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - -**Implemented enhancements:** - -- Parquet reader should be able to read 
structs within list [\#1186](https://github.com/apache/arrow-rs/issues/1186) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Disable serde\_json `arbitrary_precision` feature flag [\#1174](https://github.com/apache/arrow-rs/issues/1174) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Simplify and reduce code duplication in arithmetic.rs [\#1160](https://github.com/apache/arrow-rs/issues/1160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Return `Err` from JSON writer rather than `panic!` for unsupported types [\#1157](https://github.com/apache/arrow-rs/issues/1157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `scalar` mathematics kernels for `Array` and scalar value [\#1153](https://github.com/apache/arrow-rs/issues/1153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `DecimalArray` in sort kernel [\#1137](https://github.com/apache/arrow-rs/issues/1137) -- Parquet Fuzz Tests [\#1053](https://github.com/apache/arrow-rs/issues/1053) -- BooleanBufferBuilder Append Packed [\#1038](https://github.com/apache/arrow-rs/issues/1038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet Performance Optimization: StructArrayReader Redundant Level & Bitmap Computation [\#1034](https://github.com/apache/arrow-rs/issues/1034) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Reduce Public Parquet API [\#1032](https://github.com/apache/arrow-rs/issues/1032) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add `from_iter_values` for binary array [\#1188](https://github.com/apache/arrow-rs/pull/1188) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- Add support for `MapArray` in json writer [\#1149](https://github.com/apache/arrow-rs/pull/1149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) - -**Fixed bugs:** - -- Empty string arrays 
with no nulls are not equal [\#1208](https://github.com/apache/arrow-rs/issues/1208) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Pretty print a `RecordBatch` containing `Float16` triggers a panic [\#1193](https://github.com/apache/arrow-rs/issues/1193) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Writing structs nested in lists produces an incorrect output [\#1184](https://github.com/apache/arrow-rs/issues/1184) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Undefined behavior for `GenericStringArray::from_iter_values` if reported iterator upper bound is incorrect [\#1144](https://github.com/apache/arrow-rs/issues/1144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Interval comparisons with `simd` feature asserts [\#1136](https://github.com/apache/arrow-rs/issues/1136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RecordReader Permits Illegal Types [\#1132](https://github.com/apache/arrow-rs/issues/1132) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Security fixes:** - -- Fix undefined behavior in GenericStringArray::from\_iter\_values [\#1145](https://github.com/apache/arrow-rs/pull/1145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- parquet: Optimized ByteArrayReader, Add UTF-8 Validation \(\#1040\) [\#1082](https://github.com/apache/arrow-rs/pull/1082) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - -**Documentation updates:** - -- Update parquet crate readme [\#1192](https://github.com/apache/arrow-rs/pull/1192) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Document safety justification of some uses of `from_trusted_len_iter` [\#1148](https://github.com/apache/arrow-rs/pull/1148) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Performance improvements:** - -- Improve parquet reading performance for columns with nulls by preserving bitmask when possible \(\#1037\) [\#1054](https://github.com/apache/arrow-rs/pull/1054) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve parquet performance: Skip levels computation for required struct arrays in parquet [\#1035](https://github.com/apache/arrow-rs/pull/1035) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - -**Closed issues:** - -- Generify ColumnReaderImpl and RecordReader [\#1040](https://github.com/apache/arrow-rs/issues/1040) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet Preserve BitMask [\#1037](https://github.com/apache/arrow-rs/issues/1037) - -**Merged pull requests:** - -- fix a bug in variable sized equality [\#1209](https://github.com/apache/arrow-rs/pull/1209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) -- Pin WASM / packed SIMD tests to nightly-2022-01-17 [\#1204](https://github.com/apache/arrow-rs/pull/1204) ([alamb](https://github.com/alamb)) -- feat: add support for casting Duration/Interval to Int64Array [\#1196](https://github.com/apache/arrow-rs/pull/1196) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([e-dard](https://github.com/e-dard)) -- Add comparison support for fully qualified BinaryArray [\#1195](https://github.com/apache/arrow-rs/pull/1195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Fix in display of `Float16Array` [\#1194](https://github.com/apache/arrow-rs/pull/1194) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) -- update 
nightly version for miri [\#1189](https://github.com/apache/arrow-rs/pull/1189) ([Jimexist](https://github.com/Jimexist)) -- feat\(parquet\): support for reading structs nested within lists [\#1187](https://github.com/apache/arrow-rs/pull/1187) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([helgikrs](https://github.com/helgikrs)) -- fix: Fix a bug in how definition levels are calculated for nested structs in a list [\#1185](https://github.com/apache/arrow-rs/pull/1185) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([helgikrs](https://github.com/helgikrs)) -- Truncate bitmask on BooleanBufferBuilder::resize: [\#1183](https://github.com/apache/arrow-rs/pull/1183) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add ticket reference for false positive in clippy [\#1181](https://github.com/apache/arrow-rs/pull/1181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Fix record formatting in 1.58 [\#1178](https://github.com/apache/arrow-rs/pull/1178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Serialize i128 as JSON string [\#1175](https://github.com/apache/arrow-rs/pull/1175) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support DecimalType in `sort` and `take` kernels [\#1172](https://github.com/apache/arrow-rs/pull/1172) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Fix new clippy lints introduced in Rust 1.58 [\#1170](https://github.com/apache/arrow-rs/pull/1170) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Fix compilation error with simd feature 
[\#1169](https://github.com/apache/arrow-rs/pull/1169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Fix bug while writing parquet with empty lists of structs [\#1166](https://github.com/apache/arrow-rs/pull/1166) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([helgikrs](https://github.com/helgikrs)) -- Use tempfile for parquet tests [\#1165](https://github.com/apache/arrow-rs/pull/1165) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Remove left over dev/README.md file from arrow/arrow-rs split [\#1162](https://github.com/apache/arrow-rs/pull/1162) ([alamb](https://github.com/alamb)) -- Add multiply\_scalar kernel [\#1159](https://github.com/apache/arrow-rs/pull/1159) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fuzz test different parquet encodings [\#1156](https://github.com/apache/arrow-rs/pull/1156) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add subtract\_scalar kernel [\#1152](https://github.com/apache/arrow-rs/pull/1152) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add add\_scalar kernel [\#1151](https://github.com/apache/arrow-rs/pull/1151) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Move simd right out of for\_each loop [\#1150](https://github.com/apache/arrow-rs/pull/1150) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Internal Remove `GenericStringArray::from_vec` and `GenericStringArray::from_opt_vec` [\#1147](https://github.com/apache/arrow-rs/pull/1147) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Implement SIMD comparison operations for types with less than 4 lanes \(i128\) 
[\#1146](https://github.com/apache/arrow-rs/pull/1146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Extends parquet fuzz tests to also tests nulls, dictionaries and row groups with multiple pages \(\#1053\) [\#1110](https://github.com/apache/arrow-rs/pull/1110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Generify ColumnReaderImpl and RecordReader \(\#1040\) [\#1041](https://github.com/apache/arrow-rs/pull/1041) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- BooleanBufferBuilder::append\_packed \(\#1038\) [\#1039](https://github.com/apache/arrow-rs/pull/1039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - -## [7.0.0](https://github.com/apache/arrow-rs/tree/7.0.0) (2022-1-07) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/6.5.0...7.0.0) - -### Arrow - -**Breaking changes:** -- `pretty_format_batches` now returns `Result` rather than `String`: [#975](https://github.com/apache/arrow-rs/pull/975) -- `MutableBuffer::typed_data_mut` is marked `unsafe`: [#1029](https://github.com/apache/arrow-rs/pull/1029) -- UnionArray updated match latest Arrow spec, added `UnionMode`, `UnionArray::new()` marked `unsafe`: [#885](https://github.com/apache/arrow-rs/pull/885) - -**New Features:** -- Support for `Float16Array` types [#888](https://github.com/apache/arrow-rs/pull/888) -- IPC support for `UnionArray` [#654](https://github.com/apache/arrow-rs/issues/654) -- Dynamic comparison kernels for scalars (e.g. 
`eq_dyn_scalar`), including `DictionaryArray`: [#1113](https://github.com/apache/arrow-rs/issues/1113) - -**Enhancements:** -- Added `Schema::with_metadata` and `Field::with_metadata` [#1092](https://github.com/apache/arrow-rs/pull/1092) -- Support for custom datetime format for inference and parsing csv files [#1112](https://github.com/apache/arrow-rs/pull/1112) -- Implement `Array` for `ArrayRef` for easier use [#1129](https://github.com/apache/arrow-rs/pull/1129) -- Pretty printing display support for `FixedSizeBinaryArray` [#1097](https://github.com/apache/arrow-rs/pull/1097) -- Dependency Upgrades: `pyo3`, `parquet-format`, `prost`, `tonic` -- Avoid allocating vector of indices in `lexicographical_partition_ranges`[#998](https://github.com/apache/arrow-rs/pull/998) - -### Parquet - -**Fixed bugs:** -- (parquet) Fix reading of dictionary encoded pages with null values: [#1130](https://github.com/apache/arrow-rs/pull/1130) - - -# Changelog - -## [6.5.0](https://github.com/apache/arrow-rs/tree/6.5.0) (2021-12-23) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/6.4.0...6.5.0) - -* [092fc64bbb019244887ebd0d9c9a2d3e3a9aebc0](https://github.com/apache/arrow-rs/commit/092fc64bbb019244887ebd0d9c9a2d3e3a9aebc0) support cast decimal to decimal ([#1084](https://github.com/apache/arrow-rs/pull/1084)) ([#1093](https://github.com/apache/arrow-rs/pull/1093)) -* [01459762ed18b504e00e7b2818fce91f19188b1e](https://github.com/apache/arrow-rs/commit/01459762ed18b504e00e7b2818fce91f19188b1e) Fix like regex escaping ([#1085](https://github.com/apache/arrow-rs/pull/1085)) ([#1090](https://github.com/apache/arrow-rs/pull/1090)) -* [7c748bfccbc2eac0c1138378736b70dcb7e26a5b](https://github.com/apache/arrow-rs/commit/7c748bfccbc2eac0c1138378736b70dcb7e26a5b) support cast decimal to signed numeric ([#1073](https://github.com/apache/arrow-rs/pull/1073)) ([#1089](https://github.com/apache/arrow-rs/pull/1089)) -* 
[bd3600b6483c253ae57a38928a636d39a6b7cb02](https://github.com/apache/arrow-rs/commit/bd3600b6483c253ae57a38928a636d39a6b7cb02) parquet: Use constant for RLE decoder buffer size ([#1070](https://github.com/apache/arrow-rs/pull/1070)) ([#1088](https://github.com/apache/arrow-rs/pull/1088)) -* [2b5c53ecd92468fd95328637a15de7f35b6fcf28](https://github.com/apache/arrow-rs/commit/2b5c53ecd92468fd95328637a15de7f35b6fcf28) Box RleDecoder index buffer ([#1061](https://github.com/apache/arrow-rs/pull/1061)) ([#1062](https://github.com/apache/arrow-rs/pull/1062)) ([#1081](https://github.com/apache/arrow-rs/pull/1081)) -* [78721bc1a467177679ad6196b994759cf4d73377](https://github.com/apache/arrow-rs/commit/78721bc1a467177679ad6196b994759cf4d73377) BooleanBufferBuilder correct buffer length ([#1051](https://github.com/apache/arrow-rs/pull/1051)) ([#1052](https://github.com/apache/arrow-rs/pull/1052)) ([#1080](https://github.com/apache/arrow-rs/pull/1080)) -* [3a5e3541d3a4db61a828011ed95c8539adf1d57c](https://github.com/apache/arrow-rs/commit/3a5e3541d3a4db61a828011ed95c8539adf1d57c) support cast signed numeric to decimal ([#1044](https://github.com/apache/arrow-rs/pull/1044)) ([#1079](https://github.com/apache/arrow-rs/pull/1079)) -* [000bdb3053098255d43288aa3e8665e8b1892a6c](https://github.com/apache/arrow-rs/commit/000bdb3053098255d43288aa3e8665e8b1892a6c) fix(compute): LIKE escape parenthesis ([#1042](https://github.com/apache/arrow-rs/pull/1042)) ([#1078](https://github.com/apache/arrow-rs/pull/1078)) -* [e0abdb9e62772a2f853974e68e744246e7f47569](https://github.com/apache/arrow-rs/commit/e0abdb9e62772a2f853974e68e744246e7f47569) Add Schema::project and RecordBatch::project functions ([#1033](https://github.com/apache/arrow-rs/pull/1033)) ([#1077](https://github.com/apache/arrow-rs/pull/1077)) -* [31911a4d6328d889d98796b896412b3997f73e13](https://github.com/apache/arrow-rs/commit/31911a4d6328d889d98796b896412b3997f73e13) Remove outdated safety example from doc 
([#1050](https://github.com/apache/arrow-rs/pull/1050)) ([#1058](https://github.com/apache/arrow-rs/pull/1058)) -* [71ac8620993a65a7f1f57278c3495556625356b3](https://github.com/apache/arrow-rs/commit/71ac8620993a65a7f1f57278c3495556625356b3) Use existing array type in `take` kernel ([#1046](https://github.com/apache/arrow-rs/pull/1046)) ([#1057](https://github.com/apache/arrow-rs/pull/1057)) -* [1c5902376b7f7d56cb5249db4f98a6a370ead919](https://github.com/apache/arrow-rs/commit/1c5902376b7f7d56cb5249db4f98a6a370ead919) Extract method to drive PageIterator -> RecordReader ([#1031](https://github.com/apache/arrow-rs/pull/1031)) ([#1056](https://github.com/apache/arrow-rs/pull/1056)) -* [7ca39361f8733b86bc0cef5ed5d74093e2c6b14d](https://github.com/apache/arrow-rs/commit/7ca39361f8733b86bc0cef5ed5d74093e2c6b14d) Clarify governance of arrow crate ([#1030](https://github.com/apache/arrow-rs/pull/1030)) ([#1055](https://github.com/apache/arrow-rs/pull/1055)) - - -## [6.4.0](https://github.com/apache/arrow-rs/tree/6.4.0) (2021-12-10) - - -[Full Changelog](https://github.com/apache/arrow-rs/compare/6.3.0...6.4.0) - - -* [049f48559f578243935b6e512d06c4c2df360bf1](https://github.com/apache/arrow-rs/commit/049f48559f578243935b6e512d06c4c2df360bf1) Force new cargo and target caching to fix CI ([#1023](https://github.com/apache/arrow-rs/pull/1023)) ([#1024](https://github.com/apache/arrow-rs/pull/1024)) -* [ef37da3b60f71a52d5ad67e9ca810dca38b29f00](https://github.com/apache/arrow-rs/commit/ef37da3b60f71a52d5ad67e9ca810dca38b29f00) Fix a broken link and some missing styling in the main arrow crate docs ([#1013](https://github.com/apache/arrow-rs/pull/1013)) ([#1019](https://github.com/apache/arrow-rs/pull/1019)) -* [f2c746a9b968714cfe05d35fcee8658371acd899](https://github.com/apache/arrow-rs/commit/f2c746a9b968714cfe05d35fcee8658371acd899) Remove out of date comment ([#1008](https://github.com/apache/arrow-rs/pull/1008)) ([#1018](https://github.com/apache/arrow-rs/pull/1018)) -* 
[557fc11e3b2a09a680c0cfbf38d27b13101b63fe](https://github.com/apache/arrow-rs/commit/557fc11e3b2a09a680c0cfbf38d27b13101b63fe) Remove unneeded `rc` feature of serde ([#990](https://github.com/apache/arrow-rs/pull/990)) ([#1016](https://github.com/apache/arrow-rs/pull/1016)) -* [b28385e096b1cf8f5fb2773d49b160f93d94fbac](https://github.com/apache/arrow-rs/commit/b28385e096b1cf8f5fb2773d49b160f93d94fbac) Docstrings for Timestamp*Array. ([#988](https://github.com/apache/arrow-rs/pull/988)) ([#1015](https://github.com/apache/arrow-rs/pull/1015)) -* [a92672e40217670d2566a85d70b0b59fffac594c](https://github.com/apache/arrow-rs/commit/a92672e40217670d2566a85d70b0b59fffac594c) Add full data validation for ArrayData::try_new() ([#1007](https://github.com/apache/arrow-rs/pull/1007)) -* [6c8b2936d7b07e1e2f5d1d48eea425a385382dfb](https://github.com/apache/arrow-rs/commit/6c8b2936d7b07e1e2f5d1d48eea425a385382dfb) Add boolean comparison to scalar kernels for less then, greater than ([#977](https://github.com/apache/arrow-rs/pull/977)) ([#1005](https://github.com/apache/arrow-rs/pull/1005)) -* [14d140aeca608a23a8a6b2c251c8f53ffd377e61](https://github.com/apache/arrow-rs/commit/14d140aeca608a23a8a6b2c251c8f53ffd377e61) Fix some typos in code and comments ([#985](https://github.com/apache/arrow-rs/pull/985)) ([#1006](https://github.com/apache/arrow-rs/pull/1006)) -* [b4507f562fb0eddfb79840871cd2733dc0e337cd](https://github.com/apache/arrow-rs/commit/b4507f562fb0eddfb79840871cd2733dc0e337cd) Fix warnings introduced by Rust/Clippy 1.57.0 ([#1004](https://github.com/apache/arrow-rs/pull/1004)) - - -## [6.3.0](https://github.com/apache/arrow-rs/tree/6.3.0) (2021-11-26) - - -[Full Changelog](https://github.com/apache/arrow-rs/compare/6.2.0...6.3.0) - - -**Changes:** -* [7e51df015ce851a5de444ca08b57b38e7ee959a3](https://github.com/apache/arrow-rs/commit/7e51df015ce851a5de444ca08b57b38e7ee959a3) add more error test case and change the code style 
([#952](https://github.com/apache/arrow-rs/pull/952)) ([#976](https://github.com/apache/arrow-rs/pull/976)) -* [6c570cfe98d6a7a4ec74b139b733c5c72ed10015](https://github.com/apache/arrow-rs/commit/6c570cfe98d6a7a4ec74b139b733c5c72ed10015) Support read decimal data from csv reader if user provide the schema with decimal data type ([#941](https://github.com/apache/arrow-rs/pull/941)) ([#974](https://github.com/apache/arrow-rs/pull/974)) -* [4fa0d4d7f7d9ca0a3da2a6dfe3eae6dc2d51a79a](https://github.com/apache/arrow-rs/commit/4fa0d4d7f7d9ca0a3da2a6dfe3eae6dc2d51a79a) Adding Pretty Print Support For Fixed Size List ([#958](https://github.com/apache/arrow-rs/pull/958)) ([#968](https://github.com/apache/arrow-rs/pull/968)) -* [9d453a3128013c03e8ed854ded76b15cc6f28be4](https://github.com/apache/arrow-rs/commit/9d453a3128013c03e8ed854ded76b15cc6f28be4) Fix bug in temporal utilities due to DST being ignored. ([#955](https://github.com/apache/arrow-rs/pull/955)) ([#967](https://github.com/apache/arrow-rs/pull/967)) -* [1b9fd9e3fb2653236513bb7dda5aa2fa14d1d831](https://github.com/apache/arrow-rs/commit/1b9fd9e3fb2653236513bb7dda5aa2fa14d1d831) Inferring 2. 
as Float64 for issue [#929](https://github.com/apache/arrow-rs/pull/929) ([#950](https://github.com/apache/arrow-rs/pull/950)) ([#966](https://github.com/apache/arrow-rs/pull/966)) -* [e6c5e1c877bd94b3d6e545567f901d9962257cf8](https://github.com/apache/arrow-rs/commit/e6c5e1c877bd94b3d6e545567f901d9962257cf8) Fix CI for latest nightly ([#970](https://github.com/apache/arrow-rs/pull/970)) ([#973](https://github.com/apache/arrow-rs/pull/973)) -* [c96e8de457442806e18944f0b26dd06ba4cb1aee](https://github.com/apache/arrow-rs/commit/c96e8de457442806e18944f0b26dd06ba4cb1aee) Fix primitive sort when input contains more nulls than the given sort limit ([#954](https://github.com/apache/arrow-rs/pull/954)) ([#965](https://github.com/apache/arrow-rs/pull/965)) -* [094037d418381584178db1d886cad3b5024b414a](https://github.com/apache/arrow-rs/commit/094037d418381584178db1d886cad3b5024b414a) Update comfy-table to 5.0 ([#957](https://github.com/apache/arrow-rs/pull/957)) ([#964](https://github.com/apache/arrow-rs/pull/964)) -* [9f635021eee6786c5377c891218c5f88ebce07c3](https://github.com/apache/arrow-rs/commit/9f635021eee6786c5377c891218c5f88ebce07c3) Fix csv writing of timestamps to show timezone. 
([#849](https://github.com/apache/arrow-rs/pull/849)) ([#963](https://github.com/apache/arrow-rs/pull/963)) -* [f7deba4c3a050a52608462ee8a827bb8f6364140](https://github.com/apache/arrow-rs/commit/f7deba4c3a050a52608462ee8a827bb8f6364140) Adding ability to parse float from number with leading decimal ([#831](https://github.com/apache/arrow-rs/pull/831)) ([#962](https://github.com/apache/arrow-rs/pull/962)) -* [59f96e842d05b63882f7ba285c66a9739761cf84](https://github.com/apache/arrow-rs/commit/59f96e842d05b63882f7ba285c66a9739761cf84) add ilike comparitor ([#874](https://github.com/apache/arrow-rs/pull/874)) ([#961](https://github.com/apache/arrow-rs/pull/961)) -* [54023c8a5543c9f9fa4955afa01189029f3e96f5](https://github.com/apache/arrow-rs/commit/54023c8a5543c9f9fa4955afa01189029f3e96f5) Remove unpassable cargo publish check from verify-release-candidate.sh ([#882](https://github.com/apache/arrow-rs/pull/882)) ([#949](https://github.com/apache/arrow-rs/pull/949)) - - - -## [6.2.0](https://github.com/apache/arrow-rs/tree/6.2.0) (2021-11-12) - - -[Full Changelog](https://github.com/apache/arrow-rs/compare/6.1.0...6.2.0) - -**Features / Fixes:** - - -* [4037933e43cad9e4de027039ce14caa65f78300a](https://github.com/apache/arrow-rs/commit/4037933e43cad9e4de027039ce14caa65f78300a) Fix validation for offsets of StructArrays ([#942](https://github.com/apache/arrow-rs/pull/942)) ([#946](https://github.com/apache/arrow-rs/pull/946)) -* [1af9ca5d363d870550026a7b1abcb749befbb371](https://github.com/apache/arrow-rs/commit/1af9ca5d363d870550026a7b1abcb749befbb371) implement take kernel for null arrays ([#939](https://github.com/apache/arrow-rs/pull/939)) ([#944](https://github.com/apache/arrow-rs/pull/944)) -* [320de1c20aefbf204f6888e2ad3663863afeba9f](https://github.com/apache/arrow-rs/commit/320de1c20aefbf204f6888e2ad3663863afeba9f) add checker for appending i128 to decimal builder ([#928](https://github.com/apache/arrow-rs/pull/928)) 
([#943](https://github.com/apache/arrow-rs/pull/943)) -* [dff14113884ad4246a8cafb9be579ebdb4e1481f](https://github.com/apache/arrow-rs/commit/dff14113884ad4246a8cafb9be579ebdb4e1481f) Validate arguments to ArrayData::new and null bit buffer and buffers ([#810](https://github.com/apache/arrow-rs/pull/810)) ([#936](https://github.com/apache/arrow-rs/pull/936)) -* [c3eae1ec56303b97c9e15263063a6a13122ef194](https://github.com/apache/arrow-rs/commit/c3eae1ec56303b97c9e15263063a6a13122ef194) fix some warning about unused variables in panic tests ([#894](https://github.com/apache/arrow-rs/pull/894)) ([#933](https://github.com/apache/arrow-rs/pull/933)) -* [e80bb018450f13a30811ffd244c42917d8bf8a62](https://github.com/apache/arrow-rs/commit/e80bb018450f13a30811ffd244c42917d8bf8a62) fix some clippy warnings ([#896](https://github.com/apache/arrow-rs/pull/896)) ([#930](https://github.com/apache/arrow-rs/pull/930)) -* [bde89463b627be3f60b5569d038ca36c434da71d](https://github.com/apache/arrow-rs/commit/bde89463b627be3f60b5569d038ca36c434da71d) feat(ipc): add support for deserializing messages with nested dictionary fields ([#923](https://github.com/apache/arrow-rs/pull/923)) ([#931](https://github.com/apache/arrow-rs/pull/931)) -* [792544b5fb7b84224ef9745ecb9f330663c14fb4](https://github.com/apache/arrow-rs/commit/792544b5fb7b84224ef9745ecb9f330663c14fb4) refactor regexp_is_match_utf8_scalar to try to mitigate miri failures ([#895](https://github.com/apache/arrow-rs/pull/895)) ([#932](https://github.com/apache/arrow-rs/pull/932)) -* [3f0e252811cbb6e3f7c774959787dcfec985d03e](https://github.com/apache/arrow-rs/commit/3f0e252811cbb6e3f7c774959787dcfec985d03e) Automatically retry failed MIRI runs to work around intermittent failures ([#934](https://github.com/apache/arrow-rs/pull/934)) -* [c9a9515c46d560ced00e23ff57cb10a1c97573cb](https://github.com/apache/arrow-rs/commit/c9a9515c46d560ced00e23ff57cb10a1c97573cb) Update mod.rs ([#909](https://github.com/apache/arrow-rs/pull/909)) 
([#919](https://github.com/apache/arrow-rs/pull/919)) -* [64ed79ece67141b92dc45b8a1d43cb9d909aa6a9](https://github.com/apache/arrow-rs/commit/64ed79ece67141b92dc45b8a1d43cb9d909aa6a9) Mark boolean kernels public ([#913](https://github.com/apache/arrow-rs/pull/913)) ([#920](https://github.com/apache/arrow-rs/pull/920)) -* [8b95fe0bbf03588c5cc00f67365c5b0dac4d7a34](https://github.com/apache/arrow-rs/commit/8b95fe0bbf03588c5cc00f67365c5b0dac4d7a34) doc example mistype ([#904](https://github.com/apache/arrow-rs/pull/904)) ([#918](https://github.com/apache/arrow-rs/pull/918)) -* [34c5eab4862cab16fdfd5f5ed6c68dce6298dfa4](https://github.com/apache/arrow-rs/commit/34c5eab4862cab16fdfd5f5ed6c68dce6298dfa4) allow null array to be cast to all other types ([#884](https://github.com/apache/arrow-rs/pull/884)) ([#917](https://github.com/apache/arrow-rs/pull/917)) -* [3c69752e55ed0c58f5a8faed918a22b45cd93766](https://github.com/apache/arrow-rs/commit/3c69752e55ed0c58f5a8faed918a22b45cd93766) Fix instances of UB that cause tests to not pass under miri ([#878](https://github.com/apache/arrow-rs/pull/878)) ([#916](https://github.com/apache/arrow-rs/pull/916)) -* [85402148c3af03d0855e81f855715ea98a7491c5](https://github.com/apache/arrow-rs/commit/85402148c3af03d0855e81f855715ea98a7491c5) feat(ipc): Support writing dictionaries nested in structs and unions ([#870](https://github.com/apache/arrow-rs/pull/870)) ([#915](https://github.com/apache/arrow-rs/pull/915)) -* [03d95e626cb0e654775fefa77786674ea41be4a2](https://github.com/apache/arrow-rs/commit/03d95e626cb0e654775fefa77786674ea41be4a2) Fix references to changelog ([#905](https://github.com/apache/arrow-rs/pull/905)) - - -## [6.1.0](https://github.com/apache/arrow-rs/tree/6.1.0) (2021-10-29) - - -[Full Changelog](https://github.com/apache/arrow-rs/compare/6.0.0...6.1.0) - -**Features / Fixes:** - -* [b42649b0088fe7762c713a41a23c1abdf8d0496d](https://github.com/apache/arrow-rs/commit/b42649b0088fe7762c713a41a23c1abdf8d0496d) 
implement eq_dyn and neq_dyn ([#858](https://github.com/apache/arrow-rs/pull/858)) ([#867](https://github.com/apache/arrow-rs/pull/867)) -* [01743f3f10a377c1ca857cd554acbf84155766d8](https://github.com/apache/arrow-rs/commit/01743f3f10a377c1ca857cd554acbf84155766d8) fix: fix a bug in offset calculation for unions ([#863](https://github.com/apache/arrow-rs/pull/863)) ([#871](https://github.com/apache/arrow-rs/pull/871)) -* [8bfff793a23f0e71008c7a9eea7a54d6b913ecff](https://github.com/apache/arrow-rs/commit/8bfff793a23f0e71008c7a9eea7a54d6b913ecff) add lt_bool, lt_eq_bool, gt_bool, gt_eq_bool ([#860](https://github.com/apache/arrow-rs/pull/860)) ([#868](https://github.com/apache/arrow-rs/pull/868)) -* [8845e91d4ab584c822e9ee903db7069551b124af](https://github.com/apache/arrow-rs/commit/8845e91d4ab584c822e9ee903db7069551b124af) fix(ipc): Support serializing structs containing dictionaries ([#848](https://github.com/apache/arrow-rs/pull/848)) ([#865](https://github.com/apache/arrow-rs/pull/865)) -* [620282a0d9fdd2a8ed7e8313d17ba3dec64c80e5](https://github.com/apache/arrow-rs/commit/620282a0d9fdd2a8ed7e8313d17ba3dec64c80e5) Implement boolean equality kernels ([#844](https://github.com/apache/arrow-rs/pull/844)) ([#857](https://github.com/apache/arrow-rs/pull/857)) -* [94cddcacf785be982e69689291ce034ef00220b4](https://github.com/apache/arrow-rs/commit/94cddcacf785be982e69689291ce034ef00220b4) Cherry pick fix parquet_derive with default features (and fix cargo publish) ([#856](https://github.com/apache/arrow-rs/pull/856)) -* [733fd583ddb3dbe6b4d58a809c444ee16ac0eae8](https://github.com/apache/arrow-rs/commit/733fd583ddb3dbe6b4d58a809c444ee16ac0eae8) Use kernel utility for parsing timestamps in csv reader. 
([#832](https://github.com/apache/arrow-rs/pull/832)) ([#853](https://github.com/apache/arrow-rs/pull/853)) -* [2cc64937a153f632796915d2d9869d5c2a501d28](https://github.com/apache/arrow-rs/commit/2cc64937a153f632796915d2d9869d5c2a501d28) [Minor] Fix clippy errors with new rust version (1.56) and float formatting with nightly ([#845](https://github.com/apache/arrow-rs/pull/845)) ([#850](https://github.com/apache/arrow-rs/pull/850)) - -**Other:** -* [bfac9e5a027e3bd78b7a1ec90c75a3e385bd66bb](https://github.com/apache/arrow-rs/commit/bfac9e5a027e3bd78b7a1ec90c75a3e385bd66bb) Test out new tarpaulin version ([#852](https://github.com/apache/arrow-rs/pull/852)) ([#866](https://github.com/apache/arrow-rs/pull/866)) -* [809350ced392cfc78d8a1a46228d4ffc25dea9ff](https://github.com/apache/arrow-rs/commit/809350ced392cfc78d8a1a46228d4ffc25dea9ff) Update README.md ([#834](https://github.com/apache/arrow-rs/pull/834)) ([#854](https://github.com/apache/arrow-rs/pull/854)) -* [70582f40dd21f5c710c4946266d0563a92b92337](https://github.com/apache/arrow-rs/commit/70582f40dd21f5c710c4946266d0563a92b92337) [MINOR] Delete temp file from docs ([#836](https://github.com/apache/arrow-rs/pull/836)) ([#855](https://github.com/apache/arrow-rs/pull/855)) -* [a721e00014015a7e598946b6efb9b1da8080ec85](https://github.com/apache/arrow-rs/commit/a721e00014015a7e598946b6efb9b1da8080ec85) Force fresh cargo cache key in CI ([#839](https://github.com/apache/arrow-rs/pull/839)) ([#851](https://github.com/apache/arrow-rs/pull/851)) - - -## [6.0.0](https://github.com/apache/arrow-rs/tree/6.0.0) (2021-10-13) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/5.5.0...6.0.0) - -**Breaking changes:** - -- Replace `ArrayData::new()` with `ArrayData::try_new()` and `unsafe ArrayData::new_unchecked` [\#822](https://github.com/apache/arrow-rs/pull/822) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([alamb](https://github.com/alamb)) -- Update Bitmap::len to return bits rather than bytes [\#749](https://github.com/apache/arrow-rs/pull/749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) -- use sort\_unstable\_by in primitive sorting [\#552](https://github.com/apache/arrow-rs/pull/552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- New MapArray support [\#491](https://github.com/apache/arrow-rs/pull/491) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([nevi-me](https://github.com/nevi-me)) - -**Implemented enhancements:** - -- Improve parquet binary writer speed by reducing allocations [\#819](https://github.com/apache/arrow-rs/issues/819) -- Expose buffer operations [\#808](https://github.com/apache/arrow-rs/issues/808) -- Add doc examples of writing parquet files using `ArrowWriter` [\#788](https://github.com/apache/arrow-rs/issues/788) - -**Fixed bugs:** - -- JSON reader can create null struct children on empty lists [\#825](https://github.com/apache/arrow-rs/issues/825) -- Incorrect null count for cast kernel for list arrays [\#815](https://github.com/apache/arrow-rs/issues/815) -- `minute` and `second` temporal kernels do not respect timezone [\#500](https://github.com/apache/arrow-rs/issues/500) -- Fix data corruption in json decoder f64-to-i64 cast [\#652](https://github.com/apache/arrow-rs/pull/652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xianwill](https://github.com/xianwill)) - -**Documentation updates:** - -- Doctest for PrimitiveArray using from\_iter\_values. [\#694](https://github.com/apache/arrow-rs/pull/694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([novemberkilo](https://github.com/novemberkilo)) -- Doctests for BinaryArray and LargeBinaryArray. 
[\#625](https://github.com/apache/arrow-rs/pull/625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([novemberkilo](https://github.com/novemberkilo)) -- Add links in docstrings [\#605](https://github.com/apache/arrow-rs/pull/605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - - -## [5.5.0](https://github.com/apache/arrow-rs/tree/5.5.0) (2021-09-24) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/5.4.0...5.5.0) - -**Implemented enhancements:** - -- parquet should depend on a small set of arrow features [\#800](https://github.com/apache/arrow-rs/issues/800) -- Support equality on RecordBatch [\#735](https://github.com/apache/arrow-rs/issues/735) - -**Fixed bugs:** - -- Converting from string to timestamp uses microseconds instead of milliseconds [\#780](https://github.com/apache/arrow-rs/issues/780) -- Document has no link to `RowColumIter` [\#762](https://github.com/apache/arrow-rs/issues/762) -- length on slices with null doesn't work [\#744](https://github.com/apache/arrow-rs/issues/744) - -## [5.4.0](https://github.com/apache/arrow-rs/tree/5.4.0) (2021-09-10) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/5.3.0...5.4.0) - -**Implemented enhancements:** - -- Upgrade lexical-core to 0.8 [\#747](https://github.com/apache/arrow-rs/issues/747) -- `append_nulls` and `append_trusted_len_iter` for PrimitiveBuilder [\#725](https://github.com/apache/arrow-rs/issues/725) -- Optimize MutableArrayData::extend for null buffers [\#397](https://github.com/apache/arrow-rs/issues/397) - -**Fixed bugs:** - -- Arithmetic with scalars doesn't work on slices [\#742](https://github.com/apache/arrow-rs/issues/742) -- Comparisons with scalar don't work on slices [\#740](https://github.com/apache/arrow-rs/issues/740) -- `unary` kernel doesn't respect offset [\#738](https://github.com/apache/arrow-rs/issues/738) -- `new_null_array` creates invalid struct arrays 
[\#734](https://github.com/apache/arrow-rs/issues/734) -- --no-default-features is broken for parquet [\#733](https://github.com/apache/arrow-rs/issues/733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `Bitmap::len` returns the number of bytes, not bits. [\#730](https://github.com/apache/arrow-rs/issues/730) -- Decimal logical type is formatted incorrectly by print\_schema [\#713](https://github.com/apache/arrow-rs/issues/713) -- parquet\_derive does not support chrono time values [\#711](https://github.com/apache/arrow-rs/issues/711) -- Numeric overflow when formatting Decimal type [\#710](https://github.com/apache/arrow-rs/issues/710) -- The integration tests are not running [\#690](https://github.com/apache/arrow-rs/issues/690) +- Update arrow module docs [\#1840](https://github.com/apache/arrow-rs/pull/1840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update safety disclaimer [\#1837](https://github.com/apache/arrow-rs/pull/1837) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update ballista readme link [\#1765](https://github.com/apache/arrow-rs/pull/1765) ([tustvold](https://github.com/tustvold)) +- Move changelog archive to `CHANGELOG-old.md` [\#1759](https://github.com/apache/arrow-rs/pull/1759) ([alamb](https://github.com/alamb)) **Closed issues:** -- Question: Is there no way to create a DictionaryArray with a pre-arranged mapping? 
[\#729](https://github.com/apache/arrow-rs/issues/729) - -## [5.3.0](https://github.com/apache/arrow-rs/tree/5.3.0) (2021-08-26) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/5.2.0...5.3.0) - -**Implemented enhancements:** - -- Add optimized filter kernel for regular expression matching [\#697](https://github.com/apache/arrow-rs/issues/697) -- Can't cast from timestamp array to string array [\#587](https://github.com/apache/arrow-rs/issues/587) - -**Fixed bugs:** - -- 'Encoding DELTA\_BYTE\_ARRAY is not supported' with parquet arrow readers [\#708](https://github.com/apache/arrow-rs/issues/708) -- Support reading json string into binary data type. [\#701](https://github.com/apache/arrow-rs/issues/701) - -**Closed issues:** - -- Resolve Issues with `prettytable-rs` dependency [\#69](https://github.com/apache/arrow-rs/issues/69) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -## [5.2.0](https://github.com/apache/arrow-rs/tree/5.2.0) (2021-08-12) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/5.1.0...5.2.0) - -**Implemented enhancements:** - -- Make rand an optional dependency [\#671](https://github.com/apache/arrow-rs/issues/671) -- Remove undefined behavior in `value` method of boolean and primitive arrays [\#645](https://github.com/apache/arrow-rs/issues/645) -- Avoid materialization of indices in filter\_record\_batch for single arrays [\#636](https://github.com/apache/arrow-rs/issues/636) -- Add a note about arrow crate security / safety [\#627](https://github.com/apache/arrow-rs/issues/627) -- Allow the creation of String arrays from an interator of &Option\<&str\> [\#598](https://github.com/apache/arrow-rs/issues/598) -- Support arrow map datatype [\#395](https://github.com/apache/arrow-rs/issues/395) - -**Fixed bugs:** - -- Parquet fixed length byte array columns write byte array statistics [\#660](https://github.com/apache/arrow-rs/issues/660) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- 
Parquet boolean columns write Int32 statistics [\#659](https://github.com/apache/arrow-rs/issues/659) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Writing Parquet with a boolean column fails [\#657](https://github.com/apache/arrow-rs/issues/657) -- JSON decoder data corruption for large i64/u64 [\#653](https://github.com/apache/arrow-rs/issues/653) -- Incorrect min/max statistics for strings in parquet files [\#641](https://github.com/apache/arrow-rs/issues/641) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Closed issues:** - -- Release candidate verifying script seems work on macOS [\#640](https://github.com/apache/arrow-rs/issues/640) -- Update CONTRIBUTING [\#342](https://github.com/apache/arrow-rs/issues/342) - -## [5.1.0](https://github.com/apache/arrow-rs/tree/5.1.0) (2021-07-29) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/5.0.0...5.1.0) - -**Implemented enhancements:** - -- Make FFI\_ArrowArray empty\(\) public [\#602](https://github.com/apache/arrow-rs/issues/602) -- exponential sort can be used to speed up lexico partition kernel [\#586](https://github.com/apache/arrow-rs/issues/586) -- Implement sort\(\) for binary array [\#568](https://github.com/apache/arrow-rs/issues/568) -- primitive sorting can be improved and more consistent with and without `limit` if sorted unstably [\#553](https://github.com/apache/arrow-rs/issues/553) - -**Fixed bugs:** - -- Confusing memory usage with CSV reader [\#623](https://github.com/apache/arrow-rs/issues/623) -- FFI implementation deviates from specification for array release [\#595](https://github.com/apache/arrow-rs/issues/595) -- Parquet file content is different if `~/.cargo` is in a git checkout [\#589](https://github.com/apache/arrow-rs/issues/589) -- Ensure output of MIRI is checked for success [\#581](https://github.com/apache/arrow-rs/issues/581) -- MIRI failure in `array::ffi::tests::test_struct` and other ffi tests 
[\#580](https://github.com/apache/arrow-rs/issues/580) -- ListArray equality check may return wrong result [\#570](https://github.com/apache/arrow-rs/issues/570) -- cargo audit failed [\#561](https://github.com/apache/arrow-rs/issues/561) -- ArrayData::slice\(\) does not work for nested types such as StructArray [\#554](https://github.com/apache/arrow-rs/issues/554) - -**Documentation updates:** - -- More examples of how to construct Arrays [\#301](https://github.com/apache/arrow-rs/issues/301) - -**Closed issues:** - -- Implement StringBuilder::append\_option [\#263](https://github.com/apache/arrow-rs/issues/263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -## [5.0.0](https://github.com/apache/arrow-rs/tree/5.0.0) (2021-07-14) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/4.4.0...5.0.0) - -**Breaking changes:** - -- Remove lifetime from DynComparator [\#543](https://github.com/apache/arrow-rs/issues/543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Simplify interactions with arrow flight APIs [\#376](https://github.com/apache/arrow-rs/issues/376) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- refactor: remove lifetime from DynComparator [\#542](https://github.com/apache/arrow-rs/pull/542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([e-dard](https://github.com/e-dard)) -- use iterator for partition kernel instead of generating vec [\#438](https://github.com/apache/arrow-rs/pull/438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- Remove DictionaryArray::keys\_array method [\#419](https://github.com/apache/arrow-rs/pull/419) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- simplify interactions with arrow flight APIs [\#377](https://github.com/apache/arrow-rs/pull/377) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([garyanaplan](https://github.com/garyanaplan)) -- return reference from DictionaryArray::values\(\) \(\#313\) [\#314](https://github.com/apache/arrow-rs/pull/314) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - -**Implemented enhancements:** - -- Allow creation of StringArrays from Vec\ [\#519](https://github.com/apache/arrow-rs/issues/519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement RecordBatch::concat [\#461](https://github.com/apache/arrow-rs/issues/461) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement RecordBatch::slice\(\) to slice RecordBatches [\#460](https://github.com/apache/arrow-rs/issues/460) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add a RecordBatch::split to split large batches into a set of smaller batches [\#343](https://github.com/apache/arrow-rs/issues/343) -- generate parquet schema from rust struct [\#539](https://github.com/apache/arrow-rs/pull/539) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) -- Implement `RecordBatch::concat` [\#537](https://github.com/apache/arrow-rs/pull/537) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([silathdiir](https://github.com/silathdiir)) -- Implement function slice for RecordBatch [\#490](https://github.com/apache/arrow-rs/pull/490) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([b41sh](https://github.com/b41sh)) -- add lexicographically partition points and ranges [\#424](https://github.com/apache/arrow-rs/pull/424) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- allow to read non-standard CSV [\#326](https://github.com/apache/arrow-rs/pull/326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kazuk](https://github.com/kazuk)) -- parquet: Speed up `BitReader`/`DeltaBitPackDecoder` [\#325](https://github.com/apache/arrow-rs/pull/325) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kornholi](https://github.com/kornholi)) -- ARROW-12343: \[Rust\] Support auto-vectorization for min/max [\#9](https://github.com/apache/arrow-rs/pull/9) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- ARROW-12411: \[Rust\] Create RecordBatches from Iterators [\#7](https://github.com/apache/arrow-rs/pull/7) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Fixed bugs:** - -- Error building on master - error: cyclic package dependency: package `ahash v0.7.4` depends on itself. Cycle [\#544](https://github.com/apache/arrow-rs/issues/544) -- IPC reader panics with out of bounds error [\#541](https://github.com/apache/arrow-rs/issues/541) -- Take kernel doesn't handle nulls and structs correctly [\#530](https://github.com/apache/arrow-rs/issues/530) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- master fails to compile with `default-features=false` [\#529](https://github.com/apache/arrow-rs/issues/529) -- README developer instructions out of date [\#523](https://github.com/apache/arrow-rs/issues/523) -- Update rustc and packed\_simd in CI before 5.0 release [\#517](https://github.com/apache/arrow-rs/issues/517) -- Incorrect memory usage calculation for dictionary arrays [\#503](https://github.com/apache/arrow-rs/issues/503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- sliced null buffers lead to incorrect result in take kernel \(and probably on other places\) [\#502](https://github.com/apache/arrow-rs/issues/502) -- Cast of utf8 types and list container types don't respect offset [\#334](https://github.com/apache/arrow-rs/issues/334) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- fix take kernel null handling on structs [\#531](https://github.com/apache/arrow-rs/pull/531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([bjchambers](https://github.com/bjchambers)) -- Correct array memory usage calculation for dictionary arrays [\#505](https://github.com/apache/arrow-rs/pull/505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- parquet: improve BOOLEAN writing logic and report error on encoding fail [\#443](https://github.com/apache/arrow-rs/pull/443) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([garyanaplan](https://github.com/garyanaplan)) -- Fix bug with null buffer offset in boolean not kernel [\#418](https://github.com/apache/arrow-rs/pull/418) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- respect offset in utf8 and list casts [\#335](https://github.com/apache/arrow-rs/pull/335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) -- Fix comparison of dictionaries with different values arrays \(\#332\) [\#333](https://github.com/apache/arrow-rs/pull/333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- ensure null-counts are written for all-null columns [\#307](https://github.com/apache/arrow-rs/pull/307) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) -- fix invalid null handling in filter [\#296](https://github.com/apache/arrow-rs/pull/296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) -- fix NaN handling in parquet statistics [\#256](https://github.com/apache/arrow-rs/pull/256) ([crepererum](https://github.com/crepererum)) - -**Documentation updates:** - -- Improve arrow's crate's readme on crates.io [\#463](https://github.com/apache/arrow-rs/issues/463) -- Clean up README.md in advance of the 5.0 release [\#536](https://github.com/apache/arrow-rs/pull/536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- fix readme instructions to reflect new structure [\#524](https://github.com/apache/arrow-rs/pull/524) ([marcvanheerden](https://github.com/marcvanheerden)) -- Improve docs for NullArray, new\_null\_array and new\_empty\_array [\#240](https://github.com/apache/arrow-rs/pull/240) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- `DataType::Decimal` Non-Compliant? [\#1779](https://github.com/apache/arrow-rs/issues/1779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Further simplify the offset validation [\#1770](https://github.com/apache/arrow-rs/issues/1770) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Best way to convert arrow to Rust native type [\#1760](https://github.com/apache/arrow-rs/issues/1760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Why `Parquet` is a part of `Arrow`? 
[\#1715](https://github.com/apache/arrow-rs/issues/1715) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Fix default arrow build [\#533](https://github.com/apache/arrow-rs/pull/533) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add tests for building applications using arrow with different feature flags [\#532](https://github.com/apache/arrow-rs/pull/532) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Remove unused futures dependency from arrow-flight [\#528](https://github.com/apache/arrow-rs/pull/528) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- CI: update rust nightly and packed\_simd [\#525](https://github.com/apache/arrow-rs/pull/525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) -- Support `StringArray` creation from String Vec [\#522](https://github.com/apache/arrow-rs/pull/522) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([silathdiir](https://github.com/silathdiir)) -- Fix parquet benchmark schema [\#513](https://github.com/apache/arrow-rs/pull/513) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) -- Fix parquet definition levels [\#511](https://github.com/apache/arrow-rs/pull/511) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) -- Fix for primitive and boolean take kernel for nullable indices with an offset [\#509](https://github.com/apache/arrow-rs/pull/509) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Bump flatbuffers [\#499](https://github.com/apache/arrow-rs/pull/499) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([PsiACE](https://github.com/PsiACE)) -- implement second/minute helpers for temporal [\#493](https://github.com/apache/arrow-rs/pull/493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr)) -- special case concatenating single element array shortcut [\#492](https://github.com/apache/arrow-rs/pull/492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- update docs to reflect recent changes \(joins and window functions\) [\#489](https://github.com/apache/arrow-rs/pull/489) ([Jimexist](https://github.com/Jimexist)) -- Update rand, proc-macro and zstd dependencies [\#488](https://github.com/apache/arrow-rs/pull/488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Doctest for GenericListArray. [\#474](https://github.com/apache/arrow-rs/pull/474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([novemberkilo](https://github.com/novemberkilo)) -- remove stale comment on `ArrayData` equality and update unit tests [\#472](https://github.com/apache/arrow-rs/pull/472) ([Jimexist](https://github.com/Jimexist)) -- remove unused patch file [\#471](https://github.com/apache/arrow-rs/pull/471) ([Jimexist](https://github.com/Jimexist)) -- fix clippy warnings for rust 1.53 [\#470](https://github.com/apache/arrow-rs/pull/470) ([Jimexist](https://github.com/Jimexist)) -- Fix PR labeler [\#468](https://github.com/apache/arrow-rs/pull/468) ([Dandandan](https://github.com/Dandandan)) -- Tweak dev backporting docs [\#466](https://github.com/apache/arrow-rs/pull/466) ([alamb](https://github.com/alamb)) -- Unvendor Archery [\#459](https://github.com/apache/arrow-rs/pull/459) ([kszucs](https://github.com/kszucs)) -- Add sort boolean benchmark [\#457](https://github.com/apache/arrow-rs/pull/457) 
([alamb](https://github.com/alamb)) -- Add C data interface for decimal128 and timestamp [\#453](https://github.com/apache/arrow-rs/pull/453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alippai](https://github.com/alippai)) -- Implement the Iterator trait for the json Reader. [\#451](https://github.com/apache/arrow-rs/pull/451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([LaurentMazare](https://github.com/LaurentMazare)) -- Update release docs + release email template [\#450](https://github.com/apache/arrow-rs/pull/450) ([alamb](https://github.com/alamb)) -- remove clippy unnecessary wraps suppressions in cast kernel [\#449](https://github.com/apache/arrow-rs/pull/449) ([Jimexist](https://github.com/Jimexist)) -- Use partition for bool sort [\#448](https://github.com/apache/arrow-rs/pull/448) ([Jimexist](https://github.com/Jimexist)) -- remove unnecessary wraps in sort [\#445](https://github.com/apache/arrow-rs/pull/445) ([Jimexist](https://github.com/Jimexist)) -- Python FFI bridge for Schema, Field and DataType [\#439](https://github.com/apache/arrow-rs/pull/439) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszucs](https://github.com/kszucs)) -- Update release Readme.md [\#436](https://github.com/apache/arrow-rs/pull/436) ([alamb](https://github.com/alamb)) -- Derive Eq and PartialEq for SortOptions [\#425](https://github.com/apache/arrow-rs/pull/425) ([tustvold](https://github.com/tustvold)) -- refactor lexico sort for future code reuse [\#423](https://github.com/apache/arrow-rs/pull/423) ([Jimexist](https://github.com/Jimexist)) -- Reenable MIRI check on PRs [\#421](https://github.com/apache/arrow-rs/pull/421) ([alamb](https://github.com/alamb)) -- Sort by float lists [\#420](https://github.com/apache/arrow-rs/pull/420) ([medwards](https://github.com/medwards)) -- Fix out of bounds read in bit chunk iterator [\#416](https://github.com/apache/arrow-rs/pull/416) ([jhorstmann](https://github.com/jhorstmann)) -- 
Doctests for DecimalArray. [\#414](https://github.com/apache/arrow-rs/pull/414) ([novemberkilo](https://github.com/novemberkilo)) -- Add Decimal to CsvWriter and improve debug display [\#406](https://github.com/apache/arrow-rs/pull/406) ([alippai](https://github.com/alippai)) -- MINOR: update install instruction [\#400](https://github.com/apache/arrow-rs/pull/400) ([alippai](https://github.com/alippai)) -- use prettier to auto format md files [\#398](https://github.com/apache/arrow-rs/pull/398) ([Jimexist](https://github.com/Jimexist)) -- window::shift to work for all array types [\#388](https://github.com/apache/arrow-rs/pull/388) ([Jimexist](https://github.com/Jimexist)) -- add more tests for window::shift and handle boundary cases [\#386](https://github.com/apache/arrow-rs/pull/386) ([Jimexist](https://github.com/Jimexist)) -- Implement faster arrow array reader [\#384](https://github.com/apache/arrow-rs/pull/384) ([yordan-pavlov](https://github.com/yordan-pavlov)) -- Add set\_bit to BooleanBufferBuilder to allow mutating bit in index [\#383](https://github.com/apache/arrow-rs/pull/383) ([boazberman](https://github.com/boazberman)) -- make sure that only concat preallocates buffers [\#382](https://github.com/apache/arrow-rs/pull/382) ([ritchie46](https://github.com/ritchie46)) -- Respect max rowgroup size in Arrow writer [\#381](https://github.com/apache/arrow-rs/pull/381) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) -- Fix typo in release script, update release location [\#380](https://github.com/apache/arrow-rs/pull/380) ([alamb](https://github.com/alamb)) -- Doctests for FixedSizeBinaryArray [\#378](https://github.com/apache/arrow-rs/pull/378) ([novemberkilo](https://github.com/novemberkilo)) -- Simplify shift kernel using new\_null\_array [\#370](https://github.com/apache/arrow-rs/pull/370) ([Dandandan](https://github.com/Dandandan)) -- allow `SliceableCursor` to be constructed from an `Arc` directly 
[\#369](https://github.com/apache/arrow-rs/pull/369) ([crepererum](https://github.com/crepererum)) -- Add doctest for ArrayBuilder [\#367](https://github.com/apache/arrow-rs/pull/367) ([alippai](https://github.com/alippai)) -- Fix version in readme [\#365](https://github.com/apache/arrow-rs/pull/365) ([domoritz](https://github.com/domoritz)) -- Remove superfluous space [\#363](https://github.com/apache/arrow-rs/pull/363) ([domoritz](https://github.com/domoritz)) -- Add crate badges [\#362](https://github.com/apache/arrow-rs/pull/362) ([domoritz](https://github.com/domoritz)) -- Disable MIRI check until it runs cleanly on CI [\#360](https://github.com/apache/arrow-rs/pull/360) ([alamb](https://github.com/alamb)) -- Only register Flight.proto with cargo if it exists [\#351](https://github.com/apache/arrow-rs/pull/351) ([tustvold](https://github.com/tustvold)) -- Reduce memory usage of concat \(large\)utf8 [\#348](https://github.com/apache/arrow-rs/pull/348) ([ritchie46](https://github.com/ritchie46)) -- Fix filter UB and add fast path [\#341](https://github.com/apache/arrow-rs/pull/341) ([ritchie46](https://github.com/ritchie46)) -- Automatic cherry-pick script [\#339](https://github.com/apache/arrow-rs/pull/339) ([alamb](https://github.com/alamb)) -- Doctests for BooleanArray. [\#338](https://github.com/apache/arrow-rs/pull/338) ([novemberkilo](https://github.com/novemberkilo)) -- feature gate ipc reader/writer [\#336](https://github.com/apache/arrow-rs/pull/336) ([ritchie46](https://github.com/ritchie46)) -- Add ported Rust release verification script [\#331](https://github.com/apache/arrow-rs/pull/331) ([wesm](https://github.com/wesm)) -- Doctests for StringArray and LargeStringArray. 
[\#330](https://github.com/apache/arrow-rs/pull/330) ([novemberkilo](https://github.com/novemberkilo)) -- inline PrimitiveArray::value [\#329](https://github.com/apache/arrow-rs/pull/329) ([ritchie46](https://github.com/ritchie46)) -- Enable wasm32 as a target architecture for the SIMD feature [\#324](https://github.com/apache/arrow-rs/pull/324) ([roee88](https://github.com/roee88)) -- Fix undefined behavior in FFI and enable MIRI checks on CI [\#323](https://github.com/apache/arrow-rs/pull/323) ([roee88](https://github.com/roee88)) -- Mutablebuffer::shrink\_to\_fit [\#318](https://github.com/apache/arrow-rs/pull/318) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) -- Add \(simd\) modulus op [\#317](https://github.com/apache/arrow-rs/pull/317) ([gangliao](https://github.com/gangliao)) -- feature gate csv functionality [\#312](https://github.com/apache/arrow-rs/pull/312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ritchie46](https://github.com/ritchie46)) -- \[Minor\] Version upgrades [\#304](https://github.com/apache/arrow-rs/pull/304) ([Dandandan](https://github.com/Dandandan)) -- Remove old release scripts [\#293](https://github.com/apache/arrow-rs/pull/293) ([alamb](https://github.com/alamb)) -- Add Send to the ArrayBuilder trait [\#291](https://github.com/apache/arrow-rs/pull/291) ([Max-Meldrum](https://github.com/Max-Meldrum)) -- Added changelog generator script and configuration. 
[\#289](https://github.com/apache/arrow-rs/pull/289) ([jorgecarleitao](https://github.com/jorgecarleitao)) -- manually bump development version [\#288](https://github.com/apache/arrow-rs/pull/288) ([nevi-me](https://github.com/nevi-me)) -- Fix FFI and add support for Struct type [\#287](https://github.com/apache/arrow-rs/pull/287) ([roee88](https://github.com/roee88)) -- Fix subtraction underflow when sorting string arrays with many nulls [\#285](https://github.com/apache/arrow-rs/pull/285) ([medwards](https://github.com/medwards)) -- Speed up bound checking in `take` [\#281](https://github.com/apache/arrow-rs/pull/281) ([Dandandan](https://github.com/Dandandan)) -- Update PR template by commenting out instructions [\#278](https://github.com/apache/arrow-rs/pull/278) ([nevi-me](https://github.com/nevi-me)) -- Added Decimal support to pretty-print display utility \(\#230\) [\#273](https://github.com/apache/arrow-rs/pull/273) ([mgill25](https://github.com/mgill25)) -- Fix null struct and list roundtrip [\#270](https://github.com/apache/arrow-rs/pull/270) ([nevi-me](https://github.com/nevi-me)) -- 1.52 clippy fixes [\#267](https://github.com/apache/arrow-rs/pull/267) ([nevi-me](https://github.com/nevi-me)) -- Fix typo in csv/reader.rs [\#265](https://github.com/apache/arrow-rs/pull/265) ([domoritz](https://github.com/domoritz)) -- Fix empty Schema::metadata deserialization error [\#260](https://github.com/apache/arrow-rs/pull/260) ([hulunbier](https://github.com/hulunbier)) -- update datafusion and ballista doc links [\#259](https://github.com/apache/arrow-rs/pull/259) ([Jimexist](https://github.com/Jimexist)) -- support full u32 and u64 roundtrip through parquet [\#258](https://github.com/apache/arrow-rs/pull/258) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) -- \[MINOR\] Added env to run rust in integration. 
[\#253](https://github.com/apache/arrow-rs/pull/253) ([jorgecarleitao](https://github.com/jorgecarleitao)) -- \[Minor\] Made integration tests always run. [\#248](https://github.com/apache/arrow-rs/pull/248) ([jorgecarleitao](https://github.com/jorgecarleitao)) -- fix parquet max\_definition for non-null structs [\#246](https://github.com/apache/arrow-rs/pull/246) ([nevi-me](https://github.com/nevi-me)) -- Disabled rebase needed until demonstrate working. [\#243](https://github.com/apache/arrow-rs/pull/243) ([jorgecarleitao](https://github.com/jorgecarleitao)) -- pin flatbuffers to 0.8.4 [\#239](https://github.com/apache/arrow-rs/pull/239) ([ritchie46](https://github.com/ritchie46)) -- sort\_primitive result is capped to the min of limit or values.len [\#236](https://github.com/apache/arrow-rs/pull/236) ([medwards](https://github.com/medwards)) -- Read list field correctly [\#234](https://github.com/apache/arrow-rs/pull/234) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([nevi-me](https://github.com/nevi-me)) -- Fix code examples for RecordBatch::try\_from\_iter [\#231](https://github.com/apache/arrow-rs/pull/231) ([alamb](https://github.com/alamb)) -- Support string dictionaries in csv reader \(\#228\) [\#229](https://github.com/apache/arrow-rs/pull/229) ([tustvold](https://github.com/tustvold)) -- support LargeUtf8 in sort kernel [\#26](https://github.com/apache/arrow-rs/pull/26) ([ritchie46](https://github.com/ritchie46)) -- Removed unused files [\#22](https://github.com/apache/arrow-rs/pull/22) ([jorgecarleitao](https://github.com/jorgecarleitao)) -- ARROW-12504: Buffer::from\_slice\_ref set correct capacity [\#18](https://github.com/apache/arrow-rs/pull/18) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add GitHub templates [\#17](https://github.com/apache/arrow-rs/pull/17) ([andygrove](https://github.com/andygrove)) -- ARROW-12493: Add support for writing dictionary arrays to CSV and 
JSON [\#16](https://github.com/apache/arrow-rs/pull/16) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- ARROW-12426: \[Rust\] Fix concatenation of arrow dictionaries [\#15](https://github.com/apache/arrow-rs/pull/15) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update repository and homepage urls [\#14](https://github.com/apache/arrow-rs/pull/14) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Dandandan](https://github.com/Dandandan)) -- Added rebase-needed bot [\#13](https://github.com/apache/arrow-rs/pull/13) ([jorgecarleitao](https://github.com/jorgecarleitao)) -- Added Integration tests against arrow [\#10](https://github.com/apache/arrow-rs/pull/10) ([jorgecarleitao](https://github.com/jorgecarleitao)) - -## [4.4.0](https://github.com/apache/arrow-rs/tree/4.4.0) (2021-06-24) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/4.3.0...4.4.0) - -**Breaking changes:** - -- migrate partition kernel to use Iterator trait [\#437](https://github.com/apache/arrow-rs/issues/437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove DictionaryArray::keys\_array [\#391](https://github.com/apache/arrow-rs/issues/391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make equals\_datatype method public, enabling other modules [\#1838](https://github.com/apache/arrow-rs/pull/1838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([nl5887](https://github.com/nl5887)) +- \[Minor\] Clarify `PageIterator` Documentation [\#1831](https://github.com/apache/arrow-rs/pull/1831) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Update MIRI pin [\#1828](https://github.com/apache/arrow-rs/pull/1828) 
([tustvold](https://github.com/tustvold)) +- Change to use `resolver v2`, test more feature flag combinations in CI, fix errors \(\#1630\) [\#1822](https://github.com/apache/arrow-rs/pull/1822) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ScalarBuffer abstraction \(\#1811\) [\#1820](https://github.com/apache/arrow-rs/pull/1820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix list equal for empty offset list array [\#1818](https://github.com/apache/arrow-rs/pull/1818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix Decimal and List ArrayData Validation \(\#1813\) \(\#1814\) [\#1816](https://github.com/apache/arrow-rs/pull/1816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't overwrite existing data on snappy decompress \(\#1806\) [\#1807](https://github.com/apache/arrow-rs/pull/1807) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Rename `arrow/benches/string_kernels.rs` to `arrow/benches/substring_kernels.rs` [\#1805](https://github.com/apache/arrow-rs/pull/1805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add public API for decoding parquet footer [\#1804](https://github.com/apache/arrow-rs/pull/1804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add AsyncFileReader trait [\#1803](https://github.com/apache/arrow-rs/pull/1803) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- add parquet-fromcsv \(\#1\) [\#1798](https://github.com/apache/arrow-rs/pull/1798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([kazuk](https://github.com/kazuk)) +- Use IPC row count info in IPC reader [\#1796](https://github.com/apache/arrow-rs/pull/1796) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix typos in the Memory and Buffers section of the docs home [\#1795](https://github.com/apache/arrow-rs/pull/1795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([datapythonista](https://github.com/datapythonista)) +- Write validity buffer for UnionArray in V4 IPC message [\#1794](https://github.com/apache/arrow-rs/pull/1794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat:Add function for row alignment with page mask [\#1791](https://github.com/apache/arrow-rs/pull/1791) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Read and skip validity buffer of UnionType Array for V4 ipc message [\#1789](https://github.com/apache/arrow-rs/pull/1789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Add `Substring_by_char` [\#1784](https://github.com/apache/arrow-rs/pull/1784) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add `ParquetFileArrowReader::try_new` [\#1782](https://github.com/apache/arrow-rs/pull/1782) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Arbitrary size combine option bitmap [\#1781](https://github.com/apache/arrow-rs/pull/1781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ismail-Maj](https://github.com/Ismail-Maj)) +- Implement `ChunkReader` for `Bytes`, deprecate `SliceableCursor` [\#1775](https://github.com/apache/arrow-rs/pull/1775) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) 
+- Access metadata of flushed row groups on write \(\#1691\) [\#1774](https://github.com/apache/arrow-rs/pull/1774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Simplify ParquetFileArrowReader Metadata API [\#1773](https://github.com/apache/arrow-rs/pull/1773) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- MINOR: Unpin nightly version as packed\_simd releases new version [\#1771](https://github.com/apache/arrow-rs/pull/1771) ([viirya](https://github.com/viirya)) +- Update comfy-table requirement from 5.0 to 6.0 [\#1769](https://github.com/apache/arrow-rs/pull/1769) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Optionally disable `validate_decimal_precision` check in `DecimalBuilder.append_value` for interop test [\#1767](https://github.com/apache/arrow-rs/pull/1767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Minor: Clean up the code of MutableArrayData [\#1763](https://github.com/apache/arrow-rs/pull/1763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Support reading PageIndex from parquet metadata, prepare for skipping pages at reading [\#1762](https://github.com/apache/arrow-rs/pull/1762) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Support casting `Utf8` to `Boolean` [\#1738](https://github.com/apache/arrow-rs/pull/1738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([MazterQyou](https://github.com/MazterQyou)) -**Implemented enhancements:** - -- sort kernel boolean sort can be O\(n\) [\#447](https://github.com/apache/arrow-rs/issues/447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- C data interface for decimal128, timestamp, date32 and date64 
[\#413](https://github.com/apache/arrow-rs/issues/413) -- Add Decimal to CsvWriter [\#405](https://github.com/apache/arrow-rs/issues/405) -- Use iterators to increase performance of creating Arrow arrays [\#200](https://github.com/apache/arrow-rs/issues/200) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Fixed bugs:** - -- Release Audit Tool \(RAT\) is not being triggered [\#481](https://github.com/apache/arrow-rs/issues/481) -- Security Vulnerabilities: flatbuffers: `read_scalar` and `read_scalar_at` allow transmuting values without `unsafe` blocks [\#476](https://github.com/apache/arrow-rs/issues/476) -- Clippy broken after upgrade to rust 1.53 [\#467](https://github.com/apache/arrow-rs/issues/467) -- Pull Request Labeler is not working [\#462](https://github.com/apache/arrow-rs/issues/462) -- Arrow 4.3 release: error\[E0658\]: use of unstable library feature 'partition\_point': new API [\#456](https://github.com/apache/arrow-rs/issues/456) -- parquet reading hangs when row\_group contains more than 2048 rows of data [\#349](https://github.com/apache/arrow-rs/issues/349) -- Fail to build arrow [\#247](https://github.com/apache/arrow-rs/issues/247) -- JSON reader does not implement iterator [\#193](https://github.com/apache/arrow-rs/issues/193) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Security fixes:** - -- Ensure a successful MIRI Run on CI [\#227](https://github.com/apache/arrow-rs/issues/227) - -**Closed issues:** - -- sort kernel has a lot of unnecessary wrapping [\#446](https://github.com/apache/arrow-rs/issues/446) -- \[Parquet\] Plain encoded boolean column chunks limited to 2048 values [\#48](https://github.com/apache/arrow-rs/issues/48) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -## [4.3.0](https://github.com/apache/arrow-rs/tree/4.3.0) (2021-06-10) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/4.2.0...4.3.0) - -**Implemented enhancements:** - -- Add partitioning kernel for 
sorted arrays [\#428](https://github.com/apache/arrow-rs/issues/428) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement sort by float lists [\#427](https://github.com/apache/arrow-rs/issues/427) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Derive Eq and PartialEq for SortOptions [\#426](https://github.com/apache/arrow-rs/issues/426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- use prettier and github action to normalize markdown document syntax [\#399](https://github.com/apache/arrow-rs/issues/399) -- window::shift can work for more than just primitive array type [\#392](https://github.com/apache/arrow-rs/issues/392) -- Doctest for ArrayBuilder [\#366](https://github.com/apache/arrow-rs/issues/366) - -**Fixed bugs:** - -- Boolean `not` kernel does not take offset of null buffer into account [\#417](https://github.com/apache/arrow-rs/issues/417) -- my contribution not marged in 4.2 release [\#394](https://github.com/apache/arrow-rs/issues/394) -- window::shift shall properly handle boundary cases [\#387](https://github.com/apache/arrow-rs/issues/387) -- Parquet `WriterProperties.max_row_group_size` not wired up [\#257](https://github.com/apache/arrow-rs/issues/257) -- Out of bound reads in chunk iterator [\#198](https://github.com/apache/arrow-rs/issues/198) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -## [4.2.0](https://github.com/apache/arrow-rs/tree/4.2.0) (2021-05-29) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/4.1.0...4.2.0) - -**Breaking changes:** - -- DictionaryArray::values\(\) clones the underlying ArrayRef [\#313](https://github.com/apache/arrow-rs/issues/313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Implemented enhancements:** - -- Simplify shift kernel using null array [\#371](https://github.com/apache/arrow-rs/issues/371) -- Provide `Arc`-based constructor for `parquet::util::cursor::SliceableCursor` 
[\#368](https://github.com/apache/arrow-rs/issues/368) -- Add badges to crates [\#361](https://github.com/apache/arrow-rs/issues/361) -- Consider inlining PrimitiveArray::value [\#328](https://github.com/apache/arrow-rs/issues/328) -- Implement automated release verification script [\#327](https://github.com/apache/arrow-rs/issues/327) -- Add wasm32 to the list of target architectures of the simd feature [\#316](https://github.com/apache/arrow-rs/issues/316) -- add with\_escape for csv::ReaderBuilder [\#315](https://github.com/apache/arrow-rs/issues/315) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- IPC feature gate [\#310](https://github.com/apache/arrow-rs/issues/310) -- csv feature gate [\#309](https://github.com/apache/arrow-rs/issues/309) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `shrink_to` / `shrink_to_fit` to `MutableBuffer` [\#297](https://github.com/apache/arrow-rs/issues/297) - -**Fixed bugs:** - -- Incorrect crate setup instructions [\#364](https://github.com/apache/arrow-rs/issues/364) -- Arrow-flight only register rerun-if-changed if file exists [\#350](https://github.com/apache/arrow-rs/issues/350) -- Dictionary Comparison Uses Wrong Values Array [\#332](https://github.com/apache/arrow-rs/issues/332) -- Undefined behavior in FFI implementation [\#322](https://github.com/apache/arrow-rs/issues/322) -- All-null column get wrong parquet null-counts [\#306](https://github.com/apache/arrow-rs/issues/306) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Filter has inconsistent null handling [\#295](https://github.com/apache/arrow-rs/issues/295) - -## [4.1.0](https://github.com/apache/arrow-rs/tree/4.1.0) (2021-05-17) - -[Full Changelog](https://github.com/apache/arrow-rs/compare/4.0.0...4.1.0) - -**Implemented enhancements:** - -- Add Send to ArrayBuilder [\#290](https://github.com/apache/arrow-rs/issues/290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve performance of bound 
checking option [\#280](https://github.com/apache/arrow-rs/issues/280) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- extend compute kernel arity to include nullary functions [\#276](https://github.com/apache/arrow-rs/issues/276) -- Implement FFI / CDataInterface for Struct Arrays [\#251](https://github.com/apache/arrow-rs/issues/251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for pretty-printing Decimal numbers [\#230](https://github.com/apache/arrow-rs/issues/230) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- CSV Reader String Dictionary Support [\#228](https://github.com/apache/arrow-rs/issues/228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add Builder interface for adding Arrays to record batches [\#210](https://github.com/apache/arrow-rs/issues/210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support auto-vectorization for min/max [\#209](https://github.com/apache/arrow-rs/issues/209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support LargeUtf8 in sort kernel [\#25](https://github.com/apache/arrow-rs/issues/25) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Fixed bugs:** - -- no method named `select_nth_unstable_by` found for mutable reference `&mut [T]` [\#283](https://github.com/apache/arrow-rs/issues/283) -- Rust 1.52 Clippy error [\#266](https://github.com/apache/arrow-rs/issues/266) -- NaNs can break parquet statistics [\#255](https://github.com/apache/arrow-rs/issues/255) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- u64::MAX does not roundtrip through parquet [\#254](https://github.com/apache/arrow-rs/issues/254) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Integration tests failing to compile \(flatbuffer\) [\#249](https://github.com/apache/arrow-rs/issues/249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix compatibility quirks between arrow and parquet structs 
[\#245](https://github.com/apache/arrow-rs/issues/245) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Unable to write non-null Arrow structs to Parquet [\#244](https://github.com/apache/arrow-rs/issues/244) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- schema: missing field `metadata` when deserialize [\#241](https://github.com/apache/arrow-rs/issues/241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Arrow does not compile due to flatbuffers upgrade [\#238](https://github.com/apache/arrow-rs/issues/238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Sort with limit panics for the limit includes some but not all nulls, for large arrays [\#235](https://github.com/apache/arrow-rs/issues/235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- arrow-rs contains a copy of the "format" directory [\#233](https://github.com/apache/arrow-rs/issues/233) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix SEGFAULT/ SIGILL in child-data ffi [\#206](https://github.com/apache/arrow-rs/issues/206) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Read list field correctly in \\> [\#167](https://github.com/apache/arrow-rs/issues/167) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- FFI listarray lead to undefined behavior. 
[\#20](https://github.com/apache/arrow-rs/issues/20) - -**Security fixes:** - -- Fix MIRI build on CI [\#226](https://github.com/apache/arrow-rs/issues/226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Get MIRI running again [\#224](https://github.com/apache/arrow-rs/issues/224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Documentation updates:** - -- Comment out the instructions in the PR template [\#277](https://github.com/apache/arrow-rs/issues/277) -- Update links to datafusion and ballista in README.md [\#19](https://github.com/apache/arrow-rs/issues/19) -- Update "repository" in Cargo.toml [\#12](https://github.com/apache/arrow-rs/issues/12) - -**Closed issues:** -- Arrow Aligned Vec [\#268](https://github.com/apache/arrow-rs/issues/268) -- \[Rust\]: Tracking issue for AVX-512 [\#220](https://github.com/apache/arrow-rs/issues/220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Umbrella issue for clippy integration [\#217](https://github.com/apache/arrow-rs/issues/217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support sort [\#215](https://github.com/apache/arrow-rs/issues/215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support stable Rust [\#214](https://github.com/apache/arrow-rs/issues/214) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove Rust and point integration tests to arrow-rs repo [\#211](https://github.com/apache/arrow-rs/issues/211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ArrayData buffers are inconsistent accross implementations [\#207](https://github.com/apache/arrow-rs/issues/207) -- 3.0.1 patch release [\#204](https://github.com/apache/arrow-rs/issues/204) -- Document patch release process [\#202](https://github.com/apache/arrow-rs/issues/202) -- Simplify Offset [\#186](https://github.com/apache/arrow-rs/issues/186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Typed Bytes 
[\#185](https://github.com/apache/arrow-rs/issues/185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[CI\]docker-compose setup should enable caching [\#175](https://github.com/apache/arrow-rs/issues/175) -- Improve take primitive performance [\#174](https://github.com/apache/arrow-rs/issues/174) -- \[CI\] Try out buildkite [\#165](https://github.com/apache/arrow-rs/issues/165) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Update assignees in JIRA where missing [\#160](https://github.com/apache/arrow-rs/issues/160) -- \[Rust\]: From\ implementations should validate data type [\#103](https://github.com/apache/arrow-rs/issues/103) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] Verify that projection push down does not remove aliases columns [\#99](https://github.com/apache/arrow-rs/issues/99) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Rust\]\[DataFusion\] Implement modulus expression [\#98](https://github.com/apache/arrow-rs/issues/98) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] Add constant folding to expressions during logically planning [\#96](https://github.com/apache/arrow-rs/issues/96) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] DataFrame.collect should return RecordBatchReader [\#95](https://github.com/apache/arrow-rs/issues/95) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Rust\]\[DataFusion\] Add FORMAT to explain plan and an easy to visualize format [\#94](https://github.com/apache/arrow-rs/issues/94) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] Implement metrics framework [\#90](https://github.com/apache/arrow-rs/issues/90) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] Implement micro benchmarks for each operator [\#89](https://github.com/apache/arrow-rs/issues/89) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- 
\[DataFusion\] Implement pretty print for physical query plan [\#88](https://github.com/apache/arrow-rs/issues/88) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Archery\] Support rust clippy in the lint command [\#83](https://github.com/apache/arrow-rs/issues/83) -- \[rust\]\[datafusion\] optimize count\(\*\) queries on parquet sources [\#75](https://github.com/apache/arrow-rs/issues/75) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Rust\]\[DataFusion\] Improve like/nlike performance [\#71](https://github.com/apache/arrow-rs/issues/71) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] Implement optimizer rule to remove redundant projections [\#56](https://github.com/apache/arrow-rs/issues/56) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[DataFusion\] Parquet data source does not support complex types [\#39](https://github.com/apache/arrow-rs/issues/39) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Merge utils from Parquet and Arrow [\#32](https://github.com/apache/arrow-rs/issues/32) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add benchmarks for Parquet [\#30](https://github.com/apache/arrow-rs/issues/30) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Mark methods that do not perform bounds checking as unsafe [\#28](https://github.com/apache/arrow-rs/issues/28) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Test issue [\#24](https://github.com/apache/arrow-rs/issues/24) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- This is a test issue [\#11](https://github.com/apache/arrow-rs/issues/11) -For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) \* *This Changelog was automatically generated by 
[github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index de7d36f34814..2837f028e8c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,15 @@ members = [ "arrow-flight", "integration-testing", ] +# Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built +# +# Critically this prevents dev-dependencies from enabling features even when not building a target that +# uses dev-dependencies, e.g. the library crate. This in turn ensures that we can catch invalid feature +# flag combinations that would otherwise only surface in dependent crates +# +# Reference - https://doc.rust-lang.org/nightly/cargo/reference/features.html#feature-resolver-version-2 +# +resolver = "2" # this package is excluded because it requires different compilation flags, thereby significantly changing # how it is compiled within the workspace, causing the whole workspace to be compiled from scratch diff --git a/README.md b/README.md index 08c79bac35ff..08385fb6c15d 100644 --- a/README.md +++ b/README.md @@ -66,5 +66,5 @@ There is more information in the [contributing] guide. 
[parquet-readme]: parquet/README.md [flight-readme]: arrow-flight/README.md [datafusion-readme]: https://github.com/apache/arrow-datafusion/blob/master/README.md -[ballista-readme]: https://github.com/apache/arrow-datafusion/blob/master/ballista/README.md +[ballista-readme]: https://github.com/apache/arrow-ballista/blob/master/README.md [issues]: https://github.com/apache/arrow-rs/issues diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 19215cd3df2a..c5522766e4bd 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "15.0.0" +version = "16.0.0" edition = "2021" rust-version = "1.57" authors = ["Apache Arrow "] @@ -27,14 +27,14 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "15.0.0" } -base64 = "0.13" -tonic = "0.7" -bytes = "1" -prost = "0.10" -prost-types = { version = "0.10.0", optional = true } -prost-derive = "0.10" -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } +arrow = { path = "../arrow", version = "16.0.0", default-features = false, features = ["ipc"] } +base64 = { version = "0.13", default-features = false } +tonic = { version = "0.7", default-features = false, features = ["transport", "codegen", "prost"] } +bytes = { version = "1", default-features = false } +prost = { version = "0.10", default-features = false } +prost-types = { version = "0.10.0", default-features = false, optional = true } +prost-derive = { version = "0.10", default-features = false } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } futures = { version = "0.3", default-features = false, features = ["alloc"]} [features] @@ -44,10 +44,10 @@ flight-sql-experimental = ["prost-types"] [dev-dependencies] [build-dependencies] -tonic-build = "0.7" +tonic-build = { version = "0.7", default-features = 
false, features = ["transport", "prost"] } # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = ">1.0.30" +proc-macro2 = { version = ">1.0.30", default-features = false } [[example]] name = "flight_sql_server" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 45b081799e6f..a951699f40aa 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "15.0.0" +arrow-flight = "16.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index bbca033fda3b..c76469b39ce7 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -229,7 +229,7 @@ pub mod flight_service_client { where T: tonic::client::GrpcService, T::Error: Into, - T::ResponseBody: Default + Body + Send + 'static, + T::ResponseBody: Body + Send + 'static, ::Error: Into + Send, { pub fn new(inner: T) -> Self { @@ -242,6 +242,7 @@ pub mod flight_service_client { ) -> FlightServiceClient> where F: tonic::service::Interceptor, + T::ResponseBody: Default, T: tonic::codegen::Service< http::Request, Response = http::Response< @@ -278,9 +279,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoStreamingRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { self.inner .ready() .await @@ -307,9 +308,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { 
self.inner .ready() .await @@ -388,9 +389,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { self.inner .ready() .await @@ -417,9 +418,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoStreamingRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { self.inner .ready() .await @@ -445,9 +446,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoStreamingRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { self.inner .ready() .await @@ -474,9 +475,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { self.inner .ready() .await @@ -500,9 +501,9 @@ pub mod flight_service_client { &mut self, request: impl tonic::IntoRequest, ) -> Result< - tonic::Response>, - tonic::Status, - > { + tonic::Response>, + tonic::Status, + > { self.inner .ready() .await diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 77526917f22a..dda3fc7fe3db 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -71,6 +71,7 @@ pub fn flight_data_to_arrow_batch( schema, dictionaries_by_id, None, + &message.version(), ) })? 
} diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index cba15fc61f32..58ba726091c8 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "15.0.0" +version = "16.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "15.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "16.0.0", features = ["pyarrow"] } pyo3 = { version = "0.16", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index 26c09d64d5d1..086b21834657 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -27,6 +27,7 @@ use arrow::array::{ArrayData, ArrayRef, Int64Array}; use arrow::compute::kernels; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::ArrowError; +use arrow::ffi_stream::ArrowArrayStreamReader; use arrow::pyarrow::PyArrowConvert; use arrow::record_batch::RecordBatch; @@ -111,6 +112,13 @@ fn round_trip_record_batch(obj: RecordBatch) -> PyResult { Ok(obj) } +#[pyfunction] +fn round_trip_record_batch_reader( + obj: ArrowArrayStreamReader, +) -> PyResult { + Ok(obj) +} + #[pymodule] fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(double))?; @@ -122,5 +130,6 @@ fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> m.add_wrapped(wrap_pyfunction!(round_trip_schema))?; m.add_wrapped(wrap_pyfunction!(round_trip_array))?; m.add_wrapped(wrap_pyfunction!(round_trip_record_batch))?; + 
m.add_wrapped(wrap_pyfunction!(round_trip_record_batch_reader))?; Ok(()) } diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 324956c9c6a6..a17ba6d06135 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -303,3 +303,19 @@ def test_dictionary_python(): assert a == b del a del b + +def test_record_batch_reader(): + """ + Python -> Rust -> Python + """ + schema = pa.schema([('ints', pa.list_(pa.int32()))], metadata={b'key1': b'value1'}) + batches = [ + pa.record_batch([[[1], [2, 42]]], schema), + pa.record_batch([[None, [], [5, 6]]], schema), + ] + a = pa.RecordBatchReader.from_batches(schema, batches) + b = rust.round_trip_record_batch_reader(a) + + assert b.schema == schema + got_batches = list(b) + assert got_batches == batches diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 8878d4a607fd..6579c002380d 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "15.0.0" +version = "16.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,33 +38,32 @@ path = "src/lib.rs" bench = false [dependencies] -byteorder = "1" -lz4 = "1.23" -zstd = "0.11.1" -serde = { version = "1.0" } -serde_derive = "1.0" -serde_json = { version = "1.0", features = ["preserve_order"] } -indexmap = { version = "1.6", features = ["std"] } -rand = { version = "0.8", optional = true } -num = "0.4" -half = "1.8" -csv_crate = { version = "1.1", optional = true, package="csv" } -regex = "1.3" -lazy_static = "1.4" -packed_simd = { version = "0.3", optional = true, package = "packed_simd_2" } +byteorder = { version = "1", default-features = false } +lz4 = { version = "1.23", default-features = false, optional = true } +zstd = { version = "0.11.1", optional = true, default-features = false } 
+serde = { version = "1.0", default-features = false } +serde_derive = { version = "1.0", default-features = false } +serde_json = { version = "1.0", default-features = false, features = ["preserve_order"] } +indexmap = { version = "1.6", default-features = false, features = ["std"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } +num = { version = "0.4", default-features = false, features = ["std"] } +half = { version = "1.8", default-features = false } +csv_crate = { version = "1.1", default-features = false, optional = true, package="csv" } +regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } +lazy_static = { version = "1.4", default-features = false } +packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } -chrono-tz = {version = "0.6", optional = true} -flatbuffers = { version = "2.1.2", optional = true } -hex = "0.4" -comfy-table = { version = "5.0", optional = true, default-features = false } -pyo3 = { version = "0.16", optional = true } -lexical-core = "^0.8" -multiversion = "0.6.1" -bitflags = "1.2.1" +chrono-tz = {version = "0.6", default-features = false, optional = true} +flatbuffers = { version = "2.1.2", default-features = false, features = ["thiserror"], optional = true } +hex = { version = "0.4", default-features = false, features = ["std"] } +comfy-table = { version = "6.0", optional = true, default-features = false } +pyo3 = { version = "0.16", default-features = false, optional = true } +lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } +multiversion = { version = "0.6.1", default-features = false } +bitflags = { version = "1.2.1", default-features = false } [features] default = ["csv", "ipc", "test_utils"] -avx512 = [] csv = ["csv_crate"] 
ipc = ["flatbuffers"] simd = ["packed_simd"] @@ -81,16 +80,17 @@ pyarrow = ["pyo3"] force_validate = [] [dev-dependencies] -rand = "0.8" -criterion = "0.3" -flate2 = "1" -tempfile = "3" +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +criterion = { version = "0.3", default-features = false } +flate2 = { version = "1", default-features = false, features = ["rust_backend"] } +tempfile = { version = "3", default-features = false } [build-dependencies] [[bench]] name = "aggregate_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "array_from_vec" @@ -99,6 +99,7 @@ harness = false [[bench]] name = "builder" harness = false +required-features = ["test_utils"] [[bench]] name = "buffer_bit_ops" @@ -107,6 +108,7 @@ harness = false [[bench]] name = "boolean_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "boolean_append_packed" @@ -115,22 +117,27 @@ harness = false [[bench]] name = "arithmetic_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "cast_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "comparison_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "filter_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "take_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "length_kernel" @@ -143,10 +150,12 @@ harness = false [[bench]] name = "sort_kernel" harness = false +required-features = ["test_utils"] [[bench]] name = "partition_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "csv_writer" @@ -159,6 +168,7 @@ harness = false [[bench]] name = "equal" harness = false +required-features = ["test_utils"] [[bench]] name = "array_slice" @@ -167,18 +177,22 @@ harness = false [[bench]] name = "concatenate_kernel" harness = false +required-features = ["test_utils"] [[bench]] name = "mutable_array" harness = false 
+required-features = ["test_utils"] [[bench]] name = "buffer_create" harness = false +required-features = ["test_utils"] [[bench]] -name = "string_kernels" +name = "substring_kernels" harness = false +required-features = ["test_utils"] [[bench]] name = "array_data_validate" diff --git a/arrow/README.md b/arrow/README.md index 33940d5beb85..28240e77dff3 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -32,7 +32,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `15.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `16.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Features @@ -49,13 +49,31 @@ The arrow crate provides the following features which may be enabled: ## Safety -TLDR: You should avoid using the `alloc` and `buffer` and `bitmap` modules if at all possible. These modules contain `unsafe` code, are easy to misuse, and are not needed for most users. +Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/2020/01/18/soundness-pledge.html). 
Specifically: -As with all open source code, you should carefully evaluate the suitability of `arrow` for your project, taking into consideration your needs and risk tolerance prior to doing so. +> The intent of this crate is to be free of soundness bugs. The developers will do their best to avoid them, and welcome help in analyzing and fixing them -_Background_: There are various parts of the `arrow` crate which use `unsafe` and `transmute` code internally. We are actively working as a community to minimize undefined behavior and remove `unsafe` usage to align more with Rust's core principles of safety. +Where soundness in turn is defined as: -As `arrow` exists today, it is fairly easy to misuse the code in modules named above, leading to undefined behavior. +> Code is unable to trigger undefined behaviour using safe APIs + +One way to ensure this would be to not use `unsafe`, however, as described in the opening chapter of the [Rustonomicon](https://doc.rust-lang.org/nomicon/meet-safe-and-unsafe.html) this is not a requirement, and flexibility in this regard is actually one of Rust's great strengths. + +In particular there are a number of scenarios where `unsafe` is largely unavoidable: + +* Invariants that cannot be statically verified by the compiler and unlock non-trivial performance wins, e.g. values in a StringArray are UTF-8, [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html) iterators, etc... +* FFI +* SIMD + +Additionally, this crate exposes a number of `unsafe` APIs, allowing downstream crates to explicitly opt-out of potentially expensive invariant checking where appropriate. 
+ +We have a number of strategies to help reduce this risk: + +* Provide strongly-typed `Array` and `ArrayBuilder` APIs to safely and efficiently interact with arrays +* Extensive validation logic to safely construct `ArrayData` from untrusted sources +* All commits are verified using [MIRI](https://github.com/rust-lang/miri) to detect undefined behaviour +* We provide a `force_validate` feature that enables additional validation checks for use in test/debug builds +* There is ongoing work to reduce and better document the use of unsafe, and we welcome contributions in this space ## Building for WASM @@ -82,3 +100,17 @@ cargo run --example read_csv ``` [arrow]: https://arrow.apache.org/ + + +## Performance + +Most of the compute kernels benefit a lot from being optimized for a specific CPU target. +This is especially so on x86-64 since without specifying a target the compiler can only assume support for SSE2 vector instructions. +One of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly: + + - `native`: Target the exact features of the CPU that the build is running on. + This should give the best performance when building and running locally, but should be used carefully for example when building in a CI pipeline or when shipping pre-compiled software. + - `x86-64-v3`: Includes AVX2 support and is close to the Intel `haswell` architecture released in 2013 and should be supported by any recent Intel or AMD CPU. + - `x86-64-v4`: Includes AVX512 support available on Intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors. + +These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the simd library. 
\ No newline at end of file diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs index 063f39c92729..6c6bb0463b28 100644 --- a/arrow/benches/buffer_bit_ops.rs +++ b/arrow/benches/buffer_bit_ops.rs @@ -17,11 +17,14 @@ #[macro_use] extern crate criterion; -use criterion::Criterion; + +use criterion::{Criterion, Throughput}; extern crate arrow; -use arrow::buffer::{Buffer, MutableBuffer}; +use arrow::buffer::{ + buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, +}; /// Helper function to create arrays fn create_buffer(size: usize) -> Buffer { @@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) { criterion::black_box((left | right).unwrap()); } +fn bench_buffer_not(buffer: &Buffer) { + criterion::black_box(!buffer); +} + +fn bench_buffer_and_with_offsets( + left: &Buffer, + left_offset: usize, + right: &Buffer, + right_offset: usize, + len: usize, +) { + criterion::black_box(buffer_bin_and(left, left_offset, right, right_offset, len)); +} + +fn bench_buffer_or_with_offsets( + left: &Buffer, + left_offset: usize, + right: &Buffer, + right_offset: usize, + len: usize, +) { + criterion::black_box(buffer_bin_or(left, left_offset, right, right_offset, len)); +} + +fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) { + criterion::black_box(buffer_unary_not(buffer, offset, len)); +} + fn bit_ops_benchmark(c: &mut Criterion) { let left = create_buffer(512 * 10); let right = create_buffer(512 * 10); - c.bench_function("buffer_bit_ops and", |b| { - b.iter(|| bench_buffer_and(&left, &right)) - }); + c.benchmark_group("buffer_binary_ops") + .throughput(Throughput::Bytes(3 * left.len() as u64)) + .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right))) + .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right))) + .bench_function("and_with_offset", |b| { + b.iter(|| { + bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5) + }) + }) + 
.bench_function("or_with_offset", |b| { + b.iter(|| { + bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5) + }) + }); - c.bench_function("buffer_bit_ops or", |b| { - b.iter(|| bench_buffer_or(&left, &right)) - }); + c.benchmark_group("buffer_unary_ops") + .throughput(Throughput::Bytes(2 * left.len() as u64)) + .bench_function("not", |b| b.iter(|| bench_buffer_not(&left))) + .bench_function("not_with_offset", |b| { + b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 5)) + }); } criterion_group!(benches, bit_ops_benchmark); diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 4dced67ad87f..21d83e07eec3 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -124,6 +124,11 @@ fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { .unwrap(); } +fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { + nilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) + .unwrap(); +} + fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) { regexp_is_match_utf8_scalar( criterion::black_box(arr_a), @@ -254,6 +259,26 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX")) }); + c.bench_function("nilike_utf8 scalar equals", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xxXX")) + }); + + c.bench_function("nilike_utf8 scalar contains", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xxXX%")) + }); + + c.bench_function("nilike_utf8 scalar ends with", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xXXx%")) + }); + + c.bench_function("nilike_utf8 scalar starts with", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%XXXx")) + }); + + c.bench_function("nilike_utf8 scalar complex", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xx_xX%xXX")) + }); + c.bench_function("egexp_matches_utf8 scalar starts with", |b| { b.iter(|| 
bench_regexp_is_match_utf8_scalar(&arr_string, "^xx")) }); diff --git a/arrow/benches/string_kernels.rs b/arrow/benches/substring_kernels.rs similarity index 82% rename from arrow/benches/string_kernels.rs rename to arrow/benches/substring_kernels.rs index 7df52a6bbd4c..6bbfc9c09839 100644 --- a/arrow/benches/string_kernels.rs +++ b/arrow/benches/substring_kernels.rs @@ -22,13 +22,21 @@ use criterion::Criterion; extern crate arrow; use arrow::array::*; -use arrow::compute::kernels::substring::substring; +use arrow::compute::kernels::substring::*; use arrow::util::bench_util::*; fn bench_substring(arr: &dyn Array, start: i64, length: Option) { substring(criterion::black_box(arr), start, length).unwrap(); } +fn bench_substring_by_char( + arr: &GenericStringArray, + start: i64, + length: Option, +) { + substring_by_char(criterion::black_box(arr), start, length).unwrap(); +} + fn add_benchmark(c: &mut Criterion) { let size = 65536; let val_len = 1000; @@ -44,6 +52,10 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64))) }); + c.bench_function("substring utf8 by char", |b| { + b.iter(|| bench_substring_by_char(&arr_string, 1, Some((val_len - 1) as u64))) + }); + c.bench_function("substring fixed size binary array", |b| { b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64))) }); diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index 58e41560e238..f98596f2e777 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -25,6 +25,9 @@ use arrow::datatypes::*; use arrow::error::Result; use arrow::record_batch::*; +#[cfg(feature = "prettyprint")] +use arrow::util::pretty::print_batches; + fn main() -> Result<()> { // define schema let schema = Schema::new(vec![ @@ -62,6 +65,11 @@ fn main() -> Result<()> { let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id), Arc::new(nested)])?; + #[cfg(feature = "prettyprint")] + { + 
print_batches(&[batch.clone()]).unwrap(); + } + process(&batch); Ok(()) } @@ -91,11 +99,17 @@ fn process(batch: &RecordBatch) { Field::new("sum", DataType::Float64, false), ]); - let _ = RecordBatch::try_new( + let projection = RecordBatch::try_new( Arc::new(projected_schema), vec![ id.clone(), // NOTE: this is cloning the Arc not the array data Arc::new(Float64Array::from(nested_c.data().clone())), ], - ); + ) + .unwrap(); + + #[cfg(feature = "prettyprint")] + { + print_batches(&[projection]).unwrap(); + } } diff --git a/arrow/examples/read_csv.rs b/arrow/examples/read_csv.rs index 243d8d0f7ee3..5ccf0c58a797 100644 --- a/arrow/examples/read_csv.rs +++ b/arrow/examples/read_csv.rs @@ -35,7 +35,8 @@ fn main() { Field::new("lng", DataType::Float64, false), ]); - let file = File::open("test/data/uk_cities.csv").unwrap(); + let path = format!("{}/test/data/uk_cities.csv", env!("CARGO_MANIFEST_DIR")); + let file = File::open(path).unwrap(); let mut csv = csv::Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); diff --git a/arrow/examples/read_csv_infer_schema.rs b/arrow/examples/read_csv_infer_schema.rs index 11f8cfb7f7d2..e9f5ff650706 100644 --- a/arrow/examples/read_csv_infer_schema.rs +++ b/arrow/examples/read_csv_infer_schema.rs @@ -26,7 +26,11 @@ use std::fs::File; fn main() { #[cfg(feature = "csv")] { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let path = format!( + "{}/test/data/uk_cities_with_headers.csv", + env!("CARGO_MANIFEST_DIR") + ); + let file = File::open(path).unwrap(); let builder = csv::ReaderBuilder::new() .has_header(true) .infer_schema(Some(100)); diff --git a/arrow/src/arch/avx512.rs b/arrow/src/arch/avx512.rs deleted file mode 100644 index 264532f3594c..000000000000 --- a/arrow/src/arch/avx512.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -pub(crate) const AVX512_U8X64_LANES: usize = 64; - -#[target_feature(enable = "avx512f")] -pub(crate) unsafe fn avx512_bin_and(left: &[u8], right: &[u8], res: &mut [u8]) { - use core::arch::x86_64::{__m512i, _mm512_and_si512, _mm512_loadu_epi64}; - - let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _); - let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _); - let f = _mm512_and_si512(l, r); - let s = &f as *const __m512i as *const u8; - let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; - std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); -} - -#[target_feature(enable = "avx512f")] -pub(crate) unsafe fn avx512_bin_or(left: &[u8], right: &[u8], res: &mut [u8]) { - use core::arch::x86_64::{__m512i, _mm512_loadu_epi64, _mm512_or_si512}; - - let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _); - let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _); - let f = _mm512_or_si512(l, r); - let s = &f as *const __m512i as *const u8; - let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; - std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_bitwise_and_avx512() { - let buf1 = [0b00110011u8; 64]; - let buf2 = [0b11110000u8; 64]; - 
let mut buf3 = [0b00000000; 64]; - unsafe { - avx512_bin_and(&buf1, &buf2, &mut buf3); - }; - for i in buf3.iter() { - assert_eq!(&0b00110000u8, i); - } - } - - #[test] - fn test_bitwise_or_avx512() { - let buf1 = [0b00010011u8; 64]; - let buf2 = [0b11100000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { - avx512_bin_or(&buf1, &buf2, &mut buf3); - }; - for i in buf3.iter() { - assert_eq!(&0b11110011u8, i); - } - } -} diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index f28aba59d73e..c566ff99f12e 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -873,7 +873,9 @@ mod tests { #[test] fn test_memory_size_primitive_nullable() { - let arr: PrimitiveArray = (0..128).map(Some).collect(); + let arr: PrimitiveArray = (0..128) + .map(|i| if i % 20 == 0 { Some(i) } else { None }) + .collect(); let empty_with_bitmap = PrimitiveArray::::from( ArrayData::builder(arr.data_type().clone()) .add_buffer(MutableBuffer::new(0).into()) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index a3ab4aeaa115..481ea92d66c3 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -33,6 +33,7 @@ use crate::datatypes::{ }; use crate::error::{ArrowError, Result}; use crate::util::bit_util; +use crate::util::decimal::Decimal128; use crate::{buffer::MutableBuffer, datatypes::DataType}; /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing @@ -700,6 +701,18 @@ impl From for FixedSizeBinaryArray { } } +impl From>> for FixedSizeBinaryArray { + fn from(v: Vec>) -> Self { + Self::try_from_sparse_iter(v.into_iter()).unwrap() + } +} + +impl From> for FixedSizeBinaryArray { + fn from(v: Vec<&[u8]>) -> Self { + Self::try_from_iter(v.into_iter()).unwrap() + } +} + impl fmt::Debug for FixedSizeBinaryArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?; @@ -744,7 +757,7 @@ impl Array for FixedSizeBinaryArray { /// .unwrap(); 
/// /// assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); -/// assert_eq!(8_887_000_000, decimal_array.value(0)); +/// assert_eq!(8_887_000_000_i128, decimal_array.value(0).as_i128()); /// assert_eq!("8887.000000", decimal_array.value_as_string(0)); /// assert_eq!(3, decimal_array.len()); /// assert_eq!(1, decimal_array.null_count()); @@ -763,8 +776,8 @@ pub struct DecimalArray { } impl DecimalArray { - /// Returns the element at index `i` as i128. - pub fn value(&self, i: usize) -> i128 { + /// Returns the element at index `i`. + pub fn value(&self, i: usize) -> Decimal128 { assert!(i < self.data.len(), "DecimalArray out of bounds access"); let offset = i.checked_add(self.data.offset()).unwrap(); let raw_val = unsafe { @@ -775,10 +788,11 @@ impl DecimalArray { ) }; let as_array = raw_val.try_into(); - match as_array { + let integer = match as_array { Ok(v) if raw_val.len() == 16 => i128::from_le_bytes(v), _ => panic!("DecimalArray elements are not 128bit integers."), - } + }; + Decimal128::new_from_i128(self.precision, self.scale, integer) } /// Returns the offset for the element at index `i`. 
@@ -809,23 +823,7 @@ impl DecimalArray { #[inline] pub fn value_as_string(&self, row: usize) -> String { - let value = self.value(row); - let value_str = value.to_string(); - - if self.scale == 0 { - value_str - } else { - let (sign, rest) = value_str.split_at(if value >= 0 { 0 } else { 1 }); - - if rest.len() > self.scale { - // Decimal separator is in the middle of the string - let (whole, decimal) = value_str.split_at(value_str.len() - self.scale); - format!("{}.{}", whole, decimal) - } else { - // String has to be padded - format!("{}0.{:0>width$}", sign, rest, width = self.scale) - } - } + self.value(row).as_string() } pub fn from_fixed_size_list_array( @@ -1480,18 +1478,19 @@ mod tests { 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; - let array_data = ArrayData::builder(DataType::Decimal(23, 6)) + let array_data = ArrayData::builder(DataType::Decimal(38, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) .build() .unwrap(); let decimal_array = DecimalArray::from(array_data); - assert_eq!(8_887_000_000, decimal_array.value(0)); - assert_eq!(-8_887_000_000, decimal_array.value(1)); + assert_eq!(8_887_000_000_i128, decimal_array.value(0).into()); + assert_eq!(-8_887_000_000_i128, decimal_array.value(1).into()); assert_eq!(16, decimal_array.value_length()); } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_append_error_value() { let mut decimal_builder = DecimalBuilder::new(10, 5, 3); let mut result = decimal_builder.append_value(123456); @@ -1500,9 +1499,15 @@ mod tests { "Invalid argument error: 123456 is too large to store in a Decimal of precision 5. 
Max is 99999", error.to_string() ); + + unsafe { + decimal_builder.disable_value_validation(); + } + result = decimal_builder.append_value(123456); + assert!(result.is_ok()); decimal_builder.append_value(12345).unwrap(); let arr = decimal_builder.finish(); - assert_eq!("12.345", arr.value_as_string(0)); + assert_eq!("12.345", arr.value_as_string(1)); decimal_builder = DecimalBuilder::new(10, 2, 1); result = decimal_builder.append_value(100); @@ -1511,28 +1516,31 @@ mod tests { "Invalid argument error: 100 is too large to store in a Decimal of precision 2. Max is 99", error.to_string() ); + + unsafe { + decimal_builder.disable_value_validation(); + } + result = decimal_builder.append_value(100); + assert!(result.is_ok()); decimal_builder.append_value(99).unwrap(); result = decimal_builder.append_value(-100); - error = result.unwrap_err(); - assert_eq!( - "Invalid argument error: -100 is too small to store in a Decimal of precision 2. Min is -99", - error.to_string() - ); + assert!(result.is_ok()); decimal_builder.append_value(-99).unwrap(); let arr = decimal_builder.finish(); - assert_eq!("9.9", arr.value_as_string(0)); - assert_eq!("-9.9", arr.value_as_string(1)); + assert_eq!("9.9", arr.value_as_string(1)); + assert_eq!("-9.9", arr.value_as_string(3)); } + #[test] fn test_decimal_from_iter_values() { let array = DecimalArray::from_iter_values(vec![-100, 0, 101].into_iter()); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal(38, 10)); - assert_eq!(-100, array.value(0)); + assert_eq!(-100_i128, array.value(0).into()); assert!(!array.is_null(0)); - assert_eq!(0, array.value(1)); + assert_eq!(0_i128, array.value(1).into()); assert!(!array.is_null(1)); - assert_eq!(101, array.value(2)); + assert_eq!(101_i128, array.value(2).into()); assert!(!array.is_null(2)); } @@ -1541,10 +1549,10 @@ mod tests { let array: DecimalArray = vec![Some(-100), None, Some(101)].into_iter().collect(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), 
&DataType::Decimal(38, 10)); - assert_eq!(-100, array.value(0)); + assert_eq!(-100_i128, array.value(0).into()); assert!(!array.is_null(0)); assert!(array.is_null(1)); - assert_eq!(101, array.value(2)); + assert_eq!(101_i128, array.value(2).into()); assert!(!array.is_null(2)); } @@ -1703,6 +1711,64 @@ mod tests { assert_eq!(5, arr.len()) } + #[test] + fn test_fixed_size_binary_array_from_vec() { + let values = vec!["one".as_bytes(), b"two", b"six", b"ten"]; + let array = FixedSizeBinaryArray::from(values); + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + assert_eq!(array.value(0), b"one"); + assert_eq!(array.value(1), b"two"); + assert_eq!(array.value(2), b"six"); + assert_eq!(array.value(3), b"ten"); + assert!(!array.is_null(0)); + assert!(!array.is_null(1)); + assert!(!array.is_null(2)); + assert!(!array.is_null(3)); + } + + #[test] + #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")] + fn test_fixed_size_binary_array_from_vec_incorrect_length() { + let values = vec!["one".as_bytes(), b"two", b"three", b"four"]; + let _ = FixedSizeBinaryArray::from(values); + } + + #[test] + fn test_fixed_size_binary_array_from_opt_vec() { + let values = vec![ + Some("one".as_bytes()), + Some(b"two"), + None, + Some(b"six"), + Some(b"ten"), + ]; + let array = FixedSizeBinaryArray::from(values); + assert_eq!(array.len(), 5); + assert_eq!(array.value(0), b"one"); + assert_eq!(array.value(1), b"two"); + assert_eq!(array.value(3), b"six"); + assert_eq!(array.value(4), b"ten"); + assert!(!array.is_null(0)); + assert!(!array.is_null(1)); + assert!(array.is_null(2)); + assert!(!array.is_null(3)); + assert!(!array.is_null(4)); + } + + #[test] + #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")] + fn test_fixed_size_binary_array_from_opt_vec_incorrect_length() { + let values = vec![ + Some("one".as_bytes()), + Some(b"two"), + None, + Some(b"three"), + Some(b"four"), + ]; + let _ = 
FixedSizeBinaryArray::from(values); + } + #[test] fn test_binary_array_all_null() { let data = vec![None]; diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index b967b3abb49f..0fbd5a34eb60 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -114,11 +114,11 @@ impl<'a, K: ArrowPrimitiveType> DictionaryArray { } // Safety: `validate` ensures key type is correct, and - // `validate_dictionary_offset` ensures all offsets are within range + // `validate_values` ensures all offsets are within range let array = unsafe { data.build_unchecked() }; array.validate()?; - array.validate_dictionary_offset()?; + array.validate_values()?; Ok(array.into()) } diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 8893703aa853..6f496562f896 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -397,29 +397,35 @@ impl<'a, T: ArrowPrimitiveType, Ptr: Into>> FromIterator let iter = iter.into_iter(); let (lower, _) = iter.size_hint(); - let mut null_buf = BooleanBufferBuilder::new(lower); + let mut null_builder = BooleanBufferBuilder::new(lower); let buffer: Buffer = iter .map(|item| { if let Some(a) = item.into().native { - null_buf.append(true); + null_builder.append(true); a } else { - null_buf.append(false); + null_builder.append(false); // this ensures that null items on the buffer are not arbitrary. - // This is important because falible operations can use null values (e.g. a vectorized "add") + // This is important because fallible operations can use null values (e.g. a vectorized "add") // which may panic (e.g. overflow if the number on the slots happen to be very large). 
T::Native::default() } }) .collect(); + let len = null_builder.len(); + let null_buf: Buffer = null_builder.into(); + let valid_count = null_buf.count_set_bits(); + let null_count = len - valid_count; + let opt_null_buf = (null_count != 0).then(|| null_buf); + let data = unsafe { ArrayData::new_unchecked( T::DATA_TYPE, - null_buf.len(), - None, - Some(null_buf.into()), + len, + Some(null_count), + opt_null_buf, 0, vec![buffer], vec![], @@ -1025,6 +1031,16 @@ mod tests { assert_eq!(primitive_array.len(), 10); } + #[test] + fn test_primitive_array_from_non_null_iter() { + let iter = (0..10_i32).map(Some); + let primitive_array = PrimitiveArray::::from_iter(iter); + assert_eq!(primitive_array.len(), 10); + assert_eq!(primitive_array.null_count(), 0); + assert_eq!(primitive_array.data().null_buffer(), None); + assert_eq!(primitive_array.values(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + } + #[test] #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ (values buffer)")] diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs index 5cfab0bbf858..4ff0a31c6529 100644 --- a/arrow/src/array/array_union.rs +++ b/arrow/src/array/array_union.rs @@ -185,7 +185,7 @@ impl UnionArray { } // Check the type_ids - let type_id_slice: &[i8] = unsafe { type_ids.typed_data() }; + let type_id_slice: &[i8] = type_ids.typed_data(); let invalid_type_ids = type_id_slice .iter() .filter(|i| *i < &0) @@ -201,7 +201,7 @@ impl UnionArray { // Check the value offsets if provided if let Some(offset_buffer) = &value_offsets { let max_len = type_ids.len() as i32; - let offsets_slice: &[i32] = unsafe { offset_buffer.typed_data() }; + let offsets_slice: &[i32] = offset_buffer.typed_data(); let invalid_offsets = offsets_slice .iter() .filter(|i| *i < &0 || *i > &max_len) @@ -255,9 +255,7 @@ impl UnionArray { pub fn value_offset(&self, index: usize) -> i32 { assert!(index - self.offset() < self.len()); if self.is_dense() { - // safety: reinterpreting is safe 
since the offset buffer contains `i32` values and is - // properly aligned. - unsafe { self.data().buffers()[1].typed_data::()[index] } + self.data().buffers()[1].typed_data::()[index] } else { index as i32 } @@ -436,6 +434,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] fn test_dense_i32_large() { let mut builder = UnionBuilder::new_dense(1024); diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs deleted file mode 100644 index e22a6f81ed8f..000000000000 --- a/arrow/src/array/builder.rs +++ /dev/null @@ -1,3844 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines a [`BufferBuilder`](crate::array::BufferBuilder) capable -//! of creating a [`Buffer`](crate::buffer::Buffer) which can be used -//! as an internal buffer in an [`ArrayData`](crate::array::ArrayData) -//! object. - -use std::any::Any; -use std::collections::HashMap; -use std::fmt; -use std::marker::PhantomData; -use std::mem; -use std::ops::Range; -use std::sync::Arc; - -use crate::array::*; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; - -/// Converts a `MutableBuffer` to a `BufferBuilder`. 
-/// -/// `slots` is the number of array slots currently represented in the `MutableBuffer`. -pub(crate) fn mutable_buffer_to_builder( - mutable_buffer: MutableBuffer, - slots: usize, -) -> BufferBuilder { - BufferBuilder:: { - buffer: mutable_buffer, - len: slots, - _marker: PhantomData, - } -} - -/// Converts a `BufferBuilder` into its underlying `MutableBuffer`. -/// -/// `From` is not implemented because associated type bounds are unstable. -pub(crate) fn builder_to_mutable_buffer( - builder: BufferBuilder, -) -> MutableBuffer { - builder.buffer -} - -/// Builder for creating a [`Buffer`](crate::buffer::Buffer) object. -/// -/// A [`Buffer`](crate::buffer::Buffer) is the underlying data -/// structure of Arrow's [`Arrays`](crate::array::Array). -/// -/// For all supported types, there are type definitions for the -/// generic version of `BufferBuilder`, e.g. `UInt8BufferBuilder`. -/// -/// # Example: -/// -/// ``` -/// use arrow::array::UInt8BufferBuilder; -/// -/// # fn main() -> arrow::error::Result<()> { -/// let mut builder = UInt8BufferBuilder::new(100); -/// builder.append_slice(&[42, 43, 44]); -/// builder.append(45); -/// let buffer = builder.finish(); -/// -/// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 43, 44, 45]); -/// # Ok(()) -/// # } -/// ``` -#[derive(Debug)] -pub struct BufferBuilder { - buffer: MutableBuffer, - len: usize, - _marker: PhantomData, -} - -impl BufferBuilder { - /// Creates a new builder with initial capacity for _at least_ `capacity` - /// elements of type `T`. - /// - /// The capacity can later be manually adjusted with the - /// [`reserve()`](BufferBuilder::reserve) method. - /// Also the - /// [`append()`](BufferBuilder::append), - /// [`append_slice()`](BufferBuilder::append_slice) and - /// [`advance()`](BufferBuilder::advance) - /// methods automatically increase the capacity if needed. 
- /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// - /// assert!(builder.capacity() >= 10); - /// ``` - #[inline] - pub fn new(capacity: usize) -> Self { - let buffer = MutableBuffer::new(capacity * mem::size_of::()); - - Self { - buffer, - len: 0, - _marker: PhantomData, - } - } - - /// Returns the current number of array elements in the internal buffer. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.len(), 1); - /// ``` - pub fn len(&self) -> usize { - self.len - } - - /// Returns whether the internal buffer is empty. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.is_empty(), false); - /// ``` - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the actual capacity (number of elements) of the internal buffer. - /// - /// Note: the internal capacity returned by this method might be larger than - /// what you'd expect after setting the capacity in the `new()` or `reserve()` - /// functions. - pub fn capacity(&self) -> usize { - let byte_capacity = self.buffer.capacity(); - byte_capacity / std::mem::size_of::() - } - - /// Increases the number of elements in the internal buffer by `n` - /// and resizes the buffer as needed. - /// - /// The values of the newly added elements are 0. - /// This method is usually used when appending `NULL` values to the buffer - /// as they still require physical memory space. 
- /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.advance(2); - /// - /// assert_eq!(builder.len(), 2); - /// ``` - #[inline] - pub fn advance(&mut self, i: usize) { - let new_buffer_len = (self.len + i) * mem::size_of::(); - self.buffer.resize(new_buffer_len, 0); - self.len += i; - } - - /// Reserves memory for _at least_ `n` more elements of type `T`. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.reserve(10); - /// - /// assert!(builder.capacity() >= 20); - /// ``` - #[inline] - pub fn reserve(&mut self, n: usize) { - self.buffer.reserve(n * mem::size_of::()); - } - - /// Appends a value of type `T` into the builder, - /// growing the internal buffer as needed. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.len(), 1); - /// ``` - #[inline] - pub fn append(&mut self, v: T) { - self.reserve(1); - self.buffer.push(v); - self.len += 1; - } - - /// Appends a value of type `T` into the builder N times, - /// growing the internal buffer as needed. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_n(10, 42); - /// - /// assert_eq!(builder.len(), 10); - /// ``` - #[inline] - pub fn append_n(&mut self, n: usize, v: T) { - self.reserve(n); - for _ in 0..n { - self.buffer.push(v); - } - self.len += n; - } - - /// Appends a slice of type `T`, growing the internal buffer as needed. 
- /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_slice(&[42, 44, 46]); - /// - /// assert_eq!(builder.len(), 3); - /// ``` - #[inline] - pub fn append_slice(&mut self, slice: &[T]) { - self.buffer.extend_from_slice(slice); - self.len += slice.len(); - } - - /// # Safety - /// This requires the iterator be a trusted length. This could instead require - /// the iterator implement `TrustedLen` once that is stabilized. - #[inline] - pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { - let iter = iter.into_iter(); - let len = iter - .size_hint() - .1 - .expect("append_trusted_len_iter expects upper bound"); - self.reserve(len); - for v in iter { - self.buffer.push(v) - } - self.len += len; - } - - /// Resets this builder and returns an immutable [`Buffer`](crate::buffer::Buffer). - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_slice(&[42, 44, 46]); - /// - /// let buffer = builder.finish(); - /// - /// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 44, 46]); - /// ``` - #[inline] - pub fn finish(&mut self) -> Buffer { - let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); - self.len = 0; - buf.into() - } -} - -#[derive(Debug)] -pub struct BooleanBufferBuilder { - buffer: MutableBuffer, - len: usize, -} - -impl BooleanBufferBuilder { - #[inline] - pub fn new(capacity: usize) -> Self { - let byte_capacity = bit_util::ceil(capacity, 8); - let buffer = MutableBuffer::new(byte_capacity); - Self { buffer, len: 0 } - } - - #[inline] - pub fn len(&self) -> usize { - self.len - } - - #[inline] - pub fn set_bit(&mut self, index: usize, v: bool) { - if v { - bit_util::set_bit(self.buffer.as_mut(), index); - } else { - bit_util::unset_bit(self.buffer.as_mut(), index); - } - } - - #[inline] - pub fn 
get_bit(&self, index: usize) -> bool { - bit_util::get_bit(self.buffer.as_slice(), index) - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - #[inline] - pub fn capacity(&self) -> usize { - self.buffer.capacity() * 8 - } - - #[inline] - pub fn advance(&mut self, additional: usize) { - let new_len = self.len + additional; - let new_len_bytes = bit_util::ceil(new_len, 8); - if new_len_bytes > self.buffer.len() { - self.buffer.resize(new_len_bytes, 0); - } - self.len = new_len; - } - - /// Reserve space to at least `additional` new bits. - /// Capacity will be `>= self.len() + additional`. - /// New bytes are uninitialized and reading them is undefined behavior. - #[inline] - pub fn reserve(&mut self, additional: usize) { - let capacity = self.len + additional; - if capacity > self.capacity() { - // convert differential to bytes - let additional = bit_util::ceil(capacity, 8) - self.buffer.len(); - self.buffer.reserve(additional); - } - } - - /// Resizes the buffer, either truncating its contents (with no change in capacity), or - /// growing it (potentially reallocating it) and writing `false` in the newly available bits. 
- #[inline] - pub fn resize(&mut self, len: usize) { - let len_bytes = bit_util::ceil(len, 8); - self.buffer.resize(len_bytes, 0); - self.len = len; - } - - #[inline] - pub fn append(&mut self, v: bool) { - self.advance(1); - if v { - unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), self.len - 1) }; - } - } - - #[inline] - pub fn append_n(&mut self, additional: usize, v: bool) { - self.advance(additional); - if additional > 0 && v { - let offset = self.len() - additional; - (0..additional).for_each(|i| unsafe { - bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) - }) - } - } - - #[inline] - pub fn append_slice(&mut self, slice: &[bool]) { - let additional = slice.len(); - self.advance(additional); - - let offset = self.len() - additional; - for (i, v) in slice.iter().enumerate() { - if *v { - unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) } - } - } - } - - /// Append `range` bits from `to_set` - /// - /// `to_set` is a slice of bits packed LSB-first into `[u8]` - /// - /// # Panics - /// - /// Panics if `to_set` does not contain `ceil(range.end / 8)` bytes - pub fn append_packed_range(&mut self, range: Range, to_set: &[u8]) { - let offset_write = self.len; - let len = range.end - range.start; - self.advance(len); - crate::util::bit_mask::set_bits( - self.buffer.as_slice_mut(), - to_set, - offset_write, - range.start, - len, - ); - } - - /// Returns the packed bits - pub fn as_slice(&self) -> &[u8] { - self.buffer.as_slice() - } - - #[inline] - pub fn finish(&mut self) -> Buffer { - let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); - self.len = 0; - buf.into() - } -} - -impl From for Buffer { - #[inline] - fn from(builder: BooleanBufferBuilder) -> Self { - builder.buffer.into() - } -} - -/// Trait for dealing with different array builders at runtime -/// -/// # Example -/// -/// ``` -/// # use arrow::{ -/// # array::{ArrayBuilder, ArrayRef, Float64Builder, Int64Builder, StringArray, StringBuilder}, -/// # 
error::ArrowError, -/// # }; -/// # fn main() -> std::result::Result<(), ArrowError> { -/// // Create -/// let mut data_builders: Vec> = vec![ -/// Box::new(Float64Builder::new(1024)), -/// Box::new(Int64Builder::new(1024)), -/// Box::new(StringBuilder::new(1024)), -/// ]; -/// -/// // Fill -/// data_builders[0] -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap() -/// .append_value(3.14)?; -/// data_builders[1] -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap() -/// .append_value(-1)?; -/// data_builders[2] -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap() -/// .append_value("🍎")?; -/// -/// // Finish -/// let array_refs: Vec = data_builders -/// .iter_mut() -/// .map(|builder| builder.finish()) -/// .collect(); -/// assert_eq!(array_refs[0].len(), 1); -/// assert_eq!(array_refs[1].is_null(0), false); -/// assert_eq!( -/// array_refs[2] -/// .as_any() -/// .downcast_ref::() -/// .unwrap() -/// .value(0), -/// "🍎" -/// ); -/// # Ok(()) -/// # } -/// ``` -pub trait ArrayBuilder: Any + Send { - /// Returns the number of array slots in the builder - fn len(&self) -> usize; - - /// Returns whether number of array slots is zero - fn is_empty(&self) -> bool; - - /// Builds the array - fn finish(&mut self) -> ArrayRef; - - /// Returns the builder as a non-mutable `Any` reference. - /// - /// This is most useful when one wants to call non-mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_ref` to get a reference on the specific builder. - fn as_any(&self) -> &dyn Any; - - /// Returns the builder as a mutable `Any` reference. - /// - /// This is most useful when one wants to call mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_mut` to get a reference on the specific builder. - fn as_any_mut(&mut self) -> &mut dyn Any; - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box; -} - -/// Array builder for fixed-width primitive types -/// -/// # Example -/// -/// Create a `BooleanArray` from a `BooleanBuilder` -/// -/// ``` -/// use arrow::array::{Array, BooleanArray, BooleanBuilder}; -/// -/// let mut b = BooleanBuilder::new(4); -/// b.append_value(true); -/// b.append_null(); -/// b.append_value(false); -/// b.append_value(true); -/// let arr = b.finish(); -/// -/// assert_eq!(4, arr.len()); -/// assert_eq!(1, arr.null_count()); -/// assert_eq!(true, arr.value(0)); -/// assert!(arr.is_valid(0)); -/// assert!(!arr.is_null(0)); -/// assert!(!arr.is_valid(1)); -/// assert!(arr.is_null(1)); -/// assert_eq!(false, arr.value(2)); -/// assert!(arr.is_valid(2)); -/// assert!(!arr.is_null(2)); -/// assert_eq!(true, arr.value(3)); -/// assert!(arr.is_valid(3)); -/// assert!(!arr.is_null(3)); -/// ``` -#[derive(Debug)] -pub struct BooleanBuilder { - values_builder: BooleanBufferBuilder, - bitmap_builder: BooleanBufferBuilder, -} - -impl BooleanBuilder { - /// Creates a new primitive array builder - pub fn new(capacity: usize) -> Self { - Self { - values_builder: BooleanBufferBuilder::new(capacity), - bitmap_builder: BooleanBufferBuilder::new(capacity), - } - } - - /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> usize { - self.values_builder.capacity() - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: bool) -> Result<()> { - self.bitmap_builder.append(true); - self.values_builder.append(v); - Ok(()) - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.bitmap_builder.append(false); - self.values_builder.advance(1); - Ok(()) - } - - /// Appends an `Option` into the builder - #[inline] - pub fn append_option(&mut self, v: Option) -> Result<()> { - match v { - None => self.append_null()?, - Some(v) => self.append_value(v)?, - }; - Ok(()) - } - - /// 
Appends a slice of type `T` into the builder - #[inline] - pub fn append_slice(&mut self, v: &[bool]) -> Result<()> { - self.bitmap_builder.append_n(v.len(), true); - self.values_builder.append_slice(v); - Ok(()) - } - - /// Appends values from a slice of type `T` and a validity boolean slice - #[inline] - pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<()> { - if values.len() != is_valid.len() { - return Err(ArrowError::InvalidArgumentError( - "Value and validity lengths must be equal".to_string(), - )); - } - self.bitmap_builder.append_slice(is_valid); - self.values_builder.append_slice(values); - Ok(()) - } - - /// Builds the [BooleanArray] and reset this builder. - pub fn finish(&mut self) -> BooleanArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.finish(); - let null_count = len - null_bit_buffer.count_set_bits(); - let builder = ArrayData::builder(DataType::Boolean) - .len(len) - .add_buffer(self.values_builder.finish()) - .null_bit_buffer((null_count > 0).then(|| null_bit_buffer)); - - let array_data = unsafe { builder.build_unchecked() }; - BooleanArray::from(array_data) - } -} - -impl ArrayBuilder for BooleanBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.values_builder.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.values_builder.is_empty() - } - - /// Builds the array and reset this builder. 
- fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -/// Array builder for fixed-width primitive types -#[derive(Debug)] -pub struct PrimitiveBuilder { - values_builder: BufferBuilder, - /// We only materialize the builder when we add `false`. - /// This optimization is **very** important for performance of `StringBuilder`. - bitmap_builder: Option, -} - -impl ArrayBuilder for PrimitiveBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.values_builder.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.values_builder.is_empty() - } - - /// Builds the array and reset this builder. 
- fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl PrimitiveBuilder { - /// Creates a new primitive array builder - pub fn new(capacity: usize) -> Self { - Self { - values_builder: BufferBuilder::::new(capacity), - bitmap_builder: None, - } - } - - /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> usize { - self.values_builder.capacity() - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: T::Native) -> Result<()> { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append(true); - } - self.values_builder.append(v); - Ok(()) - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.materialize_bitmap_builder(); - self.bitmap_builder.as_mut().unwrap().append(false); - self.values_builder.advance(1); - Ok(()) - } - - #[inline] - pub fn append_nulls(&mut self, n: usize) -> Result<()> { - self.materialize_bitmap_builder(); - self.bitmap_builder.as_mut().unwrap().append_n(n, false); - self.values_builder.advance(n); - Ok(()) - } - - /// Appends an `Option` into the builder - #[inline] - pub fn append_option(&mut self, v: Option) -> Result<()> { - match v { - None => self.append_null()?, - Some(v) => self.append_value(v)?, - }; - Ok(()) - } - - /// Appends a slice of type `T` into the builder - #[inline] - pub fn append_slice(&mut self, v: &[T::Native]) -> Result<()> { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_n(v.len(), true); - } - self.values_builder.append_slice(v); - Ok(()) - } - - /// Appends values from a slice of type `T` and a validity boolean slice - #[inline] - pub fn append_values( - &mut self, - values: &[T::Native], - is_valid: &[bool], - ) -> Result<()> { - if values.len() != is_valid.len() { - return Err(ArrowError::InvalidArgumentError( - "Value and validity lengths must be equal".to_string(), - )); - } - if is_valid.iter().any(|v| !*v) { - 
self.materialize_bitmap_builder(); - } - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_slice(is_valid); - } - self.values_builder.append_slice(values); - Ok(()) - } - - /// Appends values from a trusted length iterator. - /// - /// # Safety - /// This requires the iterator be a trusted length. This could instead require - /// the iterator implement `TrustedLen` once that is stabilized. - #[inline] - pub unsafe fn append_trusted_len_iter( - &mut self, - iter: impl IntoIterator, - ) -> Result<()> { - let iter = iter.into_iter(); - let len = iter - .size_hint() - .1 - .expect("append_trusted_len_iter requires an upper bound"); - - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_n(len, true); - } - self.values_builder.append_trusted_len_iter(iter); - Ok(()) - } - - /// Builds the `PrimitiveArray` and reset this builder. - pub fn finish(&mut self) -> PrimitiveArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); - let null_count = len - - null_bit_buffer - .as_ref() - .map(|b| b.count_set_bits()) - .unwrap_or(len); - let builder = ArrayData::builder(T::DATA_TYPE) - .len(len) - .add_buffer(self.values_builder.finish()) - .null_bit_buffer(if null_count > 0 { - null_bit_buffer - } else { - None - }); - - let array_data = unsafe { builder.build_unchecked() }; - PrimitiveArray::::from(array_data) - } - - /// Builds the `DictionaryArray` and reset this builder. 
- pub fn finish_dict(&mut self, values: ArrayRef) -> DictionaryArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); - let null_count = len - - null_bit_buffer - .as_ref() - .map(|b| b.count_set_bits()) - .unwrap_or(len); - let data_type = DataType::Dictionary( - Box::new(T::DATA_TYPE), - Box::new(values.data_type().clone()), - ); - let mut builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(self.values_builder.finish()); - if null_count > 0 { - builder = builder.null_bit_buffer(null_bit_buffer); - } - builder = builder.add_child_data(values.data().clone()); - let array_data = unsafe { builder.build_unchecked() }; - DictionaryArray::::from(array_data) - } - - fn materialize_bitmap_builder(&mut self) { - if self.bitmap_builder.is_some() { - return; - } - let mut b = BooleanBufferBuilder::new(0); - b.reserve(self.values_builder.capacity()); - b.append_n(self.values_builder.len, true); - self.bitmap_builder = Some(b); - } -} - -/// Array builder for `ListArray` -#[derive(Debug)] -pub struct GenericListBuilder { - offsets_builder: BufferBuilder, - bitmap_builder: BooleanBufferBuilder, - values_builder: T, - len: OffsetSize, -} - -impl GenericListBuilder { - /// Creates a new `ListArrayBuilder` from a given values array builder - pub fn new(values_builder: T) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, capacity) - } - - /// Creates a new `ListArrayBuilder` from a given values array builder - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, capacity: usize) -> Self { - let mut offsets_builder = BufferBuilder::::new(capacity + 1); - let len = OffsetSize::zero(); - offsets_builder.append(len); - Self { - offsets_builder, - bitmap_builder: BooleanBufferBuilder::new(capacity), - values_builder, - len, - } - } -} - -impl ArrayBuilder - for GenericListBuilder -where - T: 'static, -{ - /// Returns 
the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.len.to_usize().unwrap() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == OffsetSize::zero() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl GenericListBuilder -where - T: 'static, -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call `append` to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - /// Finish the current variable-length list array slot - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.offsets_builder - .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); - self.bitmap_builder.append(is_valid); - self.len += OffsetSize::one(); - Ok(()) - } - - /// Builds the `ListArray` and reset this builder. 
- pub fn finish(&mut self) -> GenericListArray { - let len = self.len(); - self.len = OffsetSize::zero(); - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_data = values_arr.data(); - - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.bitmap_builder.finish(); - self.offsets_builder.append(self.len); - let field = Box::new(Field::new( - "item", - values_data.data_type().clone(), - true, // TODO: find a consistent way of getting this - )); - let data_type = if OffsetSize::IS_LARGE { - DataType::LargeList(field) - } else { - DataType::List(field) - }; - let array_data = ArrayData::builder(data_type) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data.clone()) - .null_bit_buffer(Some(null_bit_buffer)); - - let array_data = unsafe { array_data.build_unchecked() }; - - GenericListArray::::from(array_data) - } -} - -pub type ListBuilder = GenericListBuilder; -pub type LargeListBuilder = GenericListBuilder; - -/// Array builder for `ListArray` -#[derive(Debug)] -pub struct FixedSizeListBuilder { - bitmap_builder: BooleanBufferBuilder, - values_builder: T, - len: usize, - list_len: i32, -} - -impl FixedSizeListBuilder { - /// Creates a new `FixedSizeListBuilder` from a given values array builder - /// `length` is the number of values within each array - pub fn new(values_builder: T, length: i32) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, length, capacity) - } - - /// Creates a new `FixedSizeListBuilder` from a given values array builder - /// `length` is the number of values within each array - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, length: i32, capacity: usize) -> Self { - let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); - offsets_builder.append(0); - Self { - bitmap_builder: BooleanBufferBuilder::new(capacity), - 
values_builder, - len: 0, - list_len: length, - } - } -} - -impl ArrayBuilder for FixedSizeListBuilder -where - T: 'static, -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl FixedSizeListBuilder -where - T: 'static, -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call `append` to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - pub fn value_length(&self) -> i32 { - self.list_len - } - - /// Finish the current variable-length list array slot - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.bitmap_builder.append(is_valid); - self.len += 1; - Ok(()) - } - - /// Builds the `FixedSizeListBuilder` and reset this builder. 
- pub fn finish(&mut self) -> FixedSizeListArray { - let len = self.len(); - self.len = 0; - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_data = values_arr.data(); - - // check that values_data length is multiple of len if we have data - if len != 0 { - assert!( - values_data.len() / len == self.list_len as usize, - "Values of FixedSizeList must have equal lengths, values have length {} and list has {}", - values_data.len() / len, - self.list_len - ); - } - - let null_bit_buffer = self.bitmap_builder.finish(); - let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", values_data.data_type().clone(), true)), - self.list_len, - )) - .len(len) - .add_child_data(values_data.clone()) - .null_bit_buffer(Some(null_bit_buffer)); - - let array_data = unsafe { array_data.build_unchecked() }; - - FixedSizeListArray::from(array_data) - } -} - -/// Array builder for `BinaryArray` -#[derive(Debug)] -pub struct GenericBinaryBuilder { - builder: GenericListBuilder, -} - -pub type BinaryBuilder = GenericBinaryBuilder; -pub type LargeBinaryBuilder = GenericBinaryBuilder; - -#[derive(Debug)] -pub struct GenericStringBuilder { - builder: GenericListBuilder, -} - -pub type StringBuilder = GenericStringBuilder; -pub type LargeStringBuilder = GenericStringBuilder; - -#[derive(Debug)] -pub struct FixedSizeBinaryBuilder { - builder: FixedSizeListBuilder, -} - -/// -/// Array Builder for [`DecimalArray`] -/// -/// See [`DecimalArray`] for example. -/// -#[derive(Debug)] -pub struct DecimalBuilder { - builder: FixedSizeListBuilder, - precision: usize, - scale: usize, -} - -impl ArrayBuilder for GenericBinaryBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl ArrayBuilder for GenericStringBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - let a = GenericStringBuilder::::finish(self); - Arc::new(a) - } -} - -impl ArrayBuilder for FixedSizeBinaryBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl ArrayBuilder for DecimalBuilder { - /// Returns the builder as a non-mutable `Any` reference. 
- fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl GenericBinaryBuilder { - /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: GenericListBuilder::new(values_builder), - } - } - - /// Appends a single byte value into the builder's values array. - /// - /// Note, when appending individual byte values you must call `append` to delimit each - /// distinct list value. - #[inline] - pub fn append_byte(&mut self, value: u8) -> Result<()> { - self.builder.values().append_value(value)?; - Ok(()) - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. - #[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { - self.builder.values().append_slice(value.as_ref())?; - self.builder.append(true)?; - Ok(()) - } - - /// Finish the current variable-length list array slot. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.builder.append(is_valid) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `BinaryArray` and reset this builder. 
- pub fn finish(&mut self) -> GenericBinaryArray { - GenericBinaryArray::::from(self.builder.finish()) - } -} - -impl GenericStringBuilder { - /// Creates a new `StringBuilder`, - /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder - pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: GenericListBuilder::new(values_builder), - } - } - - /// Creates a new `StringBuilder`, - /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder - /// `item_capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let values_builder = UInt8Builder::new(data_capacity); - Self { - builder: GenericListBuilder::with_capacity(values_builder, item_capacity), - } - } - - /// Appends a string into the builder. - /// - /// Automatically calls the `append` method to delimit the string appended in as a - /// distinct array element. - #[inline] - pub fn append_value(&mut self, value: impl AsRef) -> Result<()> { - self.builder - .values() - .append_slice(value.as_ref().as_bytes())?; - self.builder.append(true)?; - Ok(()) - } - - /// Finish the current variable-length list array slot. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.builder.append(is_valid) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Append an `Option` value to the array. - #[inline] - pub fn append_option(&mut self, value: Option>) -> Result<()> { - match value { - None => self.append_null()?, - Some(v) => self.append_value(v)?, - }; - Ok(()) - } - - /// Builds the `StringArray` and reset this builder. 
- pub fn finish(&mut self) -> GenericStringArray { - GenericStringArray::::from(self.builder.finish()) - } -} - -impl FixedSizeBinaryBuilder { - /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize, byte_width: i32) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: FixedSizeListBuilder::new(values_builder, byte_width), - } - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. - #[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { - if self.builder.value_length() != value.as_ref().len() as i32 { - return Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string() - )); - } - self.builder.values().append_slice(value.as_ref())?; - self.builder.append(true) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - let length: usize = self.builder.value_length() as usize; - self.builder.values().append_slice(&vec![0u8; length][..])?; - self.builder.append(false) - } - - /// Builds the `FixedSizeBinaryArray` and reset this builder. - pub fn finish(&mut self) -> FixedSizeBinaryArray { - FixedSizeBinaryArray::from(self.builder.finish()) - } -} - -impl DecimalBuilder { - /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - let byte_width = 16; - Self { - builder: FixedSizeListBuilder::new(values_builder, byte_width), - precision, - scale, - } - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. 
- #[inline] - pub fn append_value(&mut self, value: i128) -> Result<()> { - let value = validate_decimal_precision(value, self.precision)?; - let value_as_bytes = Self::from_i128_to_fixed_size_bytes( - value, - self.builder.value_length() as usize, - )?; - if self.builder.value_length() != value_as_bytes.len() as i32 { - return Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() - )); - } - self.builder - .values() - .append_slice(value_as_bytes.as_slice())?; - self.builder.append(true) - } - - fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { - if size > 16 { - return Err(ArrowError::InvalidArgumentError( - "DecimalBuilder only supports values up to 16 bytes.".to_string(), - )); - } - let res = v.to_le_bytes(); - let start_byte = 16 - size; - Ok(res[start_byte..16].to_vec()) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - let length: usize = self.builder.value_length() as usize; - self.builder.values().append_slice(&vec![0u8; length][..])?; - self.builder.append(false) - } - - /// Builds the `DecimalArray` and reset this builder. - pub fn finish(&mut self) -> DecimalArray { - DecimalArray::from_fixed_size_list_array( - self.builder.finish(), - self.precision, - self.scale, - ) - } -} - -/// Array builder for Struct types. -/// -/// Note that callers should make sure that methods of all the child field builders are -/// properly called to maintain the consistency of the data structure. 
-pub struct StructBuilder { - fields: Vec, - field_builders: Vec>, - bitmap_builder: BooleanBufferBuilder, - len: usize, -} - -impl fmt::Debug for StructBuilder { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("StructBuilder") - .field("fields", &self.fields) - .field("bitmap_builder", &self.bitmap_builder) - .field("len", &self.len) - .finish() - } -} - -impl ArrayBuilder for StructBuilder { - /// Returns the number of array slots in the builder. - /// - /// Note that this always return the first child field builder's length, and it is - /// the caller's responsibility to maintain the consistency that all the child field - /// builder should have the equal number of elements. - fn len(&self) -> usize { - self.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Builds the array. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Returns the builder as a non-mutable `Any` reference. - /// - /// This is most useful when one wants to call non-mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_ref` to get a reference on the specific builder. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - /// - /// This is most useful when one wants to call mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_mut` to get a reference on the specific builder. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } -} - -/// Returns a builder with capacity `capacity` that corresponds to the datatype `DataType` -/// This function is useful to construct arrays from an arbitrary vectors with known/expected -/// schema. 
-pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { - match datatype { - DataType::Null => unimplemented!(), - DataType::Boolean => Box::new(BooleanBuilder::new(capacity)), - DataType::Int8 => Box::new(Int8Builder::new(capacity)), - DataType::Int16 => Box::new(Int16Builder::new(capacity)), - DataType::Int32 => Box::new(Int32Builder::new(capacity)), - DataType::Int64 => Box::new(Int64Builder::new(capacity)), - DataType::UInt8 => Box::new(UInt8Builder::new(capacity)), - DataType::UInt16 => Box::new(UInt16Builder::new(capacity)), - DataType::UInt32 => Box::new(UInt32Builder::new(capacity)), - DataType::UInt64 => Box::new(UInt64Builder::new(capacity)), - DataType::Float32 => Box::new(Float32Builder::new(capacity)), - DataType::Float64 => Box::new(Float64Builder::new(capacity)), - DataType::Binary => Box::new(BinaryBuilder::new(capacity)), - DataType::FixedSizeBinary(len) => { - Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) - } - DataType::Decimal(precision, scale) => { - Box::new(DecimalBuilder::new(capacity, *precision, *scale)) - } - DataType::Utf8 => Box::new(StringBuilder::new(capacity)), - DataType::Date32 => Box::new(Date32Builder::new(capacity)), - DataType::Date64 => Box::new(Date64Builder::new(capacity)), - DataType::Time32(TimeUnit::Second) => { - Box::new(Time32SecondBuilder::new(capacity)) - } - DataType::Time32(TimeUnit::Millisecond) => { - Box::new(Time32MillisecondBuilder::new(capacity)) - } - DataType::Time64(TimeUnit::Microsecond) => { - Box::new(Time64MicrosecondBuilder::new(capacity)) - } - DataType::Time64(TimeUnit::Nanosecond) => { - Box::new(Time64NanosecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Second, _) => { - Box::new(TimestampSecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - Box::new(TimestampMillisecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - Box::new(TimestampMicrosecondBuilder::new(capacity)) - } - 
DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Box::new(TimestampNanosecondBuilder::new(capacity)) - } - DataType::Interval(IntervalUnit::YearMonth) => { - Box::new(IntervalYearMonthBuilder::new(capacity)) - } - DataType::Interval(IntervalUnit::DayTime) => { - Box::new(IntervalDayTimeBuilder::new(capacity)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - Box::new(IntervalMonthDayNanoBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Second) => { - Box::new(DurationSecondBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Millisecond) => { - Box::new(DurationMillisecondBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Microsecond) => { - Box::new(DurationMicrosecondBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Nanosecond) => { - Box::new(DurationNanosecondBuilder::new(capacity)) - } - DataType::Struct(fields) => { - Box::new(StructBuilder::from_fields(fields.clone(), capacity)) - } - t => panic!("Data type {:?} is not currently supported", t), - } -} - -impl StructBuilder { - pub fn new(fields: Vec, field_builders: Vec>) -> Self { - Self { - fields, - field_builders, - bitmap_builder: BooleanBufferBuilder::new(0), - len: 0, - } - } - - pub fn from_fields(fields: Vec, capacity: usize) -> Self { - let mut builders = Vec::with_capacity(fields.len()); - for field in &fields { - builders.push(make_builder(field.data_type(), capacity)); - } - Self::new(fields, builders) - } - - /// Returns a mutable reference to the child field builder at index `i`. - /// Result will be `None` if the input type `T` provided doesn't match the actual - /// field builder's type. - pub fn field_builder(&mut self, i: usize) -> Option<&mut T> { - self.field_builders[i].as_any_mut().downcast_mut::() - } - - /// Returns the number of fields for the struct this builder is building. - pub fn num_fields(&self) -> usize { - self.field_builders.len() - } - - /// Appends an element (either null or non-null) to the struct. 
The actual elements - /// should be appended for each child sub-array in a consistent way. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.bitmap_builder.append(is_valid); - self.len += 1; - Ok(()) - } - - /// Appends a null element to the struct. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `StructArray` and reset this builder. - pub fn finish(&mut self) -> StructArray { - let mut child_data = Vec::with_capacity(self.field_builders.len()); - for f in &mut self.field_builders { - let arr = f.finish(); - child_data.push(arr.data().clone()); - } - - let null_bit_buffer = self.bitmap_builder.finish(); - let null_count = self.len - null_bit_buffer.count_set_bits(); - let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone())) - .len(self.len) - .child_data(child_data); - if null_count > 0 { - builder = builder.null_bit_buffer(Some(null_bit_buffer)); - } - - self.len = 0; - - let array_data = unsafe { builder.build_unchecked() }; - StructArray::from(array_data) - } -} - -#[derive(Debug)] -pub struct MapBuilder { - offsets_builder: BufferBuilder, - bitmap_builder: BooleanBufferBuilder, - field_names: MapFieldNames, - key_builder: K, - value_builder: V, - len: i32, -} - -#[derive(Debug, Clone)] -pub struct MapFieldNames { - pub entry: String, - pub key: String, - pub value: String, -} - -impl Default for MapFieldNames { - fn default() -> Self { - Self { - entry: "entries".to_string(), - key: "keys".to_string(), - value: "values".to_string(), - } - } -} - -#[allow(dead_code)] -impl MapBuilder { - pub fn new( - field_names: Option, - key_builder: K, - value_builder: V, - ) -> Self { - let capacity = key_builder.len(); - Self::with_capacity(field_names, key_builder, value_builder, capacity) - } - - pub fn with_capacity( - field_names: Option, - key_builder: K, - value_builder: V, - capacity: usize, - ) -> Self { - let mut offsets_builder = BufferBuilder::::new(capacity + 
1); - let len = 0; - offsets_builder.append(len); - Self { - offsets_builder, - bitmap_builder: BooleanBufferBuilder::new(capacity), - field_names: field_names.unwrap_or_default(), - key_builder, - value_builder, - len, - } - } - - pub fn keys(&mut self) -> &mut K { - &mut self.key_builder - } - - pub fn values(&mut self) -> &mut V { - &mut self.value_builder - } - - /// Finish the current map array slot - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - if self.key_builder.len() != self.value_builder.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Cannot append to a map builder when its keys and values have unequal lengths of {} and {}", - self.key_builder.len(), - self.value_builder.len() - ))); - } - self.offsets_builder.append(self.key_builder.len() as i32); - self.bitmap_builder.append(is_valid); - self.len += 1; - Ok(()) - } - - pub fn finish(&mut self) -> MapArray { - let len = self.len(); - self.len = 0; - - // Build the keys - let keys_arr = self - .key_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_arr = self - .value_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - - let keys_field = Field::new( - self.field_names.key.as_str(), - keys_arr.data_type().clone(), - false, // always nullable - ); - let values_field = Field::new( - self.field_names.value.as_str(), - values_arr.data_type().clone(), - true, - ); - - let struct_array = - StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); - - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.bitmap_builder.finish(); - self.offsets_builder.append(self.len); - let map_field = Box::new(Field::new( - self.field_names.entry.as_str(), - struct_array.data_type().clone(), - false, // always non-nullable - )); - let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys - .len(len) - .add_buffer(offset_buffer) - 
.add_child_data(struct_array.data().clone()) - .null_bit_buffer(Some(null_bit_buffer)); - - let array_data = unsafe { array_data.build_unchecked() }; - - MapArray::from(array_data) - } -} - -impl ArrayBuilder for MapBuilder { - fn len(&self) -> usize { - self.len as usize - } - - fn is_empty(&self) -> bool { - self.len == 0 - } - - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn into_box_any(self: Box) -> Box { - self - } -} - -/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`. -#[derive(Debug)] -struct FieldData { - /// The type id for this field - type_id: i8, - /// The Arrow data type represented in the `values_buffer`, which is untyped - data_type: DataType, - /// A buffer containing the values for this field in raw bytes - values_buffer: Option, - /// The number of array slots represented by the buffer - slots: usize, - /// A builder for the null bitmap - bitmap_builder: BooleanBufferBuilder, -} - -impl FieldData { - /// Creates a new `FieldData`. - fn new(type_id: i8, data_type: DataType) -> Self { - Self { - type_id, - data_type, - values_buffer: Some(MutableBuffer::new(1)), - slots: 0, - bitmap_builder: BooleanBufferBuilder::new(1), - } - } - - /// Appends a single value to this `FieldData`'s `values_buffer`. - #[allow(clippy::unnecessary_wraps)] - fn append_to_values_buffer( - &mut self, - v: T::Native, - ) -> Result<()> { - let values_buffer = self - .values_buffer - .take() - .expect("Values buffer was never created"); - let mut builder: BufferBuilder = - mutable_buffer_to_builder(values_buffer, self.slots); - builder.append(v); - let mutable_buffer = builder_to_mutable_buffer(builder); - self.values_buffer = Some(mutable_buffer); - - self.slots += 1; - self.bitmap_builder.append(true); - Ok(()) - } - - /// Appends a null to this `FieldData`. 
- #[allow(clippy::unnecessary_wraps)] - fn append_null(&mut self) -> Result<()> { - let values_buffer = self - .values_buffer - .take() - .expect("Values buffer was never created"); - - let mut builder: BufferBuilder = - mutable_buffer_to_builder(values_buffer, self.slots); - - builder.advance(1); - let mutable_buffer = builder_to_mutable_buffer(builder); - self.values_buffer = Some(mutable_buffer); - self.slots += 1; - self.bitmap_builder.append(false); - Ok(()) - } - - /// Appends a null to this `FieldData` when the type is not known at compile time. - /// - /// As the main `append` method of `UnionBuilder` is generic, we need a way to append null - /// slots to the fields that are not being appended to in the case of sparse unions. This - /// method solves this problem by appending dynamically based on `DataType`. - /// - /// Note, this method does **not** update the length of the `UnionArray` (this is done by the - /// main append operation) and assumes that it is called from a method that is generic over `T` - /// where `T` satisfies the bound `ArrowPrimitiveType`. - fn append_null_dynamic(&mut self) -> Result<()> { - match self.data_type { - DataType::Null => unimplemented!(), - DataType::Int8 => self.append_null::()?, - DataType::Int16 => self.append_null::()?, - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - self.append_null::()? 
- } - DataType::Int64 - | DataType::Timestamp(_, _) - | DataType::Date64 - | DataType::Time64(_) - | DataType::Interval(IntervalUnit::DayTime) - | DataType::Duration(_) => self.append_null::()?, - DataType::Interval(IntervalUnit::MonthDayNano) => self.append_null::()?, - DataType::UInt8 => self.append_null::()?, - DataType::UInt16 => self.append_null::()?, - DataType::UInt32 => self.append_null::()?, - DataType::UInt64 => self.append_null::()?, - DataType::Float32 => self.append_null::()?, - DataType::Float64 => self.append_null::()?, - _ => unreachable!("All cases of types that satisfy the trait bounds over T are covered above."), - }; - Ok(()) - } -} - -/// Builder type for creating a new `UnionArray`. -/// -/// Example: **Dense Memory Layout** -/// -/// ``` -/// use arrow::array::UnionBuilder; -/// use arrow::datatypes::{Float64Type, Int32Type}; -/// -/// let mut builder = UnionBuilder::new_dense(3); -/// builder.append::("a", 1).unwrap(); -/// builder.append::("b", 3.0).unwrap(); -/// builder.append::("a", 4).unwrap(); -/// let union = builder.build().unwrap(); -/// -/// assert_eq!(union.type_id(0), 0_i8); -/// assert_eq!(union.type_id(1), 1_i8); -/// assert_eq!(union.type_id(2), 0_i8); -/// -/// assert_eq!(union.value_offset(0), 0_i32); -/// assert_eq!(union.value_offset(1), 0_i32); -/// assert_eq!(union.value_offset(2), 1_i32); -/// ``` -/// -/// Example: **Sparse Memory Layout** -/// ``` -/// use arrow::array::UnionBuilder; -/// use arrow::datatypes::{Float64Type, Int32Type}; -/// -/// let mut builder = UnionBuilder::new_sparse(3); -/// builder.append::("a", 1).unwrap(); -/// builder.append::("b", 3.0).unwrap(); -/// builder.append::("a", 4).unwrap(); -/// let union = builder.build().unwrap(); -/// -/// assert_eq!(union.type_id(0), 0_i8); -/// assert_eq!(union.type_id(1), 1_i8); -/// assert_eq!(union.type_id(2), 0_i8); -/// -/// assert_eq!(union.value_offset(0), 0_i32); -/// assert_eq!(union.value_offset(1), 1_i32); -/// assert_eq!(union.value_offset(2), 
2_i32); -/// ``` -#[derive(Debug)] -pub struct UnionBuilder { - /// The current number of slots in the array - len: usize, - /// Maps field names to `FieldData` instances which track the builders for that field - fields: HashMap, - /// Builder to keep track of type ids - type_id_builder: Int8BufferBuilder, - /// Builder to keep track of offsets (`None` for sparse unions) - value_offset_builder: Option, -} - -impl UnionBuilder { - /// Creates a new dense array builder. - pub fn new_dense(capacity: usize) -> Self { - Self { - len: 0, - fields: HashMap::default(), - type_id_builder: Int8BufferBuilder::new(capacity), - value_offset_builder: Some(Int32BufferBuilder::new(capacity)), - } - } - - /// Creates a new sparse array builder. - pub fn new_sparse(capacity: usize) -> Self { - Self { - len: 0, - fields: HashMap::default(), - type_id_builder: Int8BufferBuilder::new(capacity), - value_offset_builder: None, - } - } - - /// Appends a null to this builder, encoding the null in the array - /// of the `type_name` child / field. - /// - /// Since `UnionArray` encodes nulls as an entry in its children - /// (it doesn't have a validity bitmap itself), and where the null - /// is part of the final array, appending a NULL requires - /// specifying which field (child) to use. - #[inline] - pub fn append_null(&mut self, type_name: &str) -> Result<()> { - self.append_option::(type_name, None) - } - - /// Appends a value to this builder. 
- #[inline] - pub fn append( - &mut self, - type_name: &str, - v: T::Native, - ) -> Result<()> { - self.append_option::(type_name, Some(v)) - } - - fn append_option( - &mut self, - type_name: &str, - v: Option, - ) -> Result<()> { - let type_name = type_name.to_string(); - - let mut field_data = match self.fields.remove(&type_name) { - Some(data) => { - if data.data_type != T::DATA_TYPE { - return Err(ArrowError::InvalidArgumentError(format!("Attempt to write col \"{}\" with type {} doesn't match existing type {}", type_name, T::DATA_TYPE, data.data_type))); - } - data - } - None => match self.value_offset_builder { - Some(_) => FieldData::new(self.fields.len() as i8, T::DATA_TYPE), - None => { - let mut fd = FieldData::new(self.fields.len() as i8, T::DATA_TYPE); - for _ in 0..self.len { - fd.append_null::()?; - } - fd - } - }, - }; - self.type_id_builder.append(field_data.type_id); - - match &mut self.value_offset_builder { - // Dense Union - Some(offset_builder) => { - offset_builder.append(field_data.slots as i32); - } - // Sparse Union - None => { - for (_, fd) in self.fields.iter_mut() { - // Append to all bar the FieldData currently being appended to - fd.append_null_dynamic()?; - } - } - } - - match v { - Some(v) => field_data.append_to_values_buffer::(v)?, - None => field_data.append_null::()?, - } - - self.fields.insert(type_name, field_data); - self.len += 1; - Ok(()) - } - - /// Builds this builder creating a new `UnionArray`. 
- pub fn build(mut self) -> Result { - let type_id_buffer = self.type_id_builder.finish(); - let value_offsets_buffer = self.value_offset_builder.map(|mut b| b.finish()); - let mut children = Vec::new(); - for ( - name, - FieldData { - type_id, - data_type, - values_buffer, - slots, - mut bitmap_builder, - }, - ) in self.fields.into_iter() - { - let buffer = values_buffer - .expect("The `values_buffer` should only ever be None inside the `append` method.") - .into(); - let arr_data_builder = ArrayDataBuilder::new(data_type.clone()) - .add_buffer(buffer) - .len(slots) - .null_bit_buffer(Some(bitmap_builder.finish())); - - let arr_data_ref = unsafe { arr_data_builder.build_unchecked() }; - let array_ref = make_array(arr_data_ref); - children.push((type_id, (Field::new(&name, data_type, false), array_ref))) - } - - children.sort_by(|a, b| { - a.0.partial_cmp(&b.0) - .expect("This will never be None as type ids are always i8 values.") - }); - let children: Vec<_> = children.into_iter().map(|(_, b)| b).collect(); - - let type_ids: Vec = (0_i8..children.len() as i8).collect(); - - UnionArray::try_new(&type_ids, type_id_buffer, value_offsets_buffer, children) - } -} - -/// Array builder for `DictionaryArray`. For example to map a set of byte indices -/// to f32 values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. 
-/// -/// # Example: -/// -/// ``` -/// use arrow::array::{ -/// Array, PrimitiveBuilder, PrimitiveDictionaryBuilder, -/// UInt8Array, UInt32Array, -/// }; -/// use arrow::datatypes::{UInt8Type, UInt32Type}; -/// -/// let key_builder = PrimitiveBuilder::::new(3); -/// let value_builder = PrimitiveBuilder::::new(2); -/// let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); -/// builder.append(12345678).unwrap(); -/// builder.append_null().unwrap(); -/// builder.append(22345678).unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &UInt8Array::from(vec![Some(0), None, Some(1)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. -/// let av = array.values(); -/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); -/// let avs: &[u32] = ava.values(); -/// -/// assert!(!array.is_null(0)); -/// assert!(array.is_null(1)); -/// assert!(!array.is_null(2)); -/// -/// assert_eq!(avs, &[12345678, 22345678]); -/// ``` -#[derive(Debug)] -pub struct PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - map: HashMap, K::Native>, -} - -impl PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Creates a new `PrimitiveDictionaryBuilder` from a keys builder and a value builder. - pub fn new( - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - ) -> Self { - Self { - keys_builder, - values_builder, - map: HashMap::new(), - } - } -} - -impl ArrayBuilder for PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Returns the builder as an non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as an mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.keys_builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.keys_builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Append a primitive value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - #[inline] - pub fn append(&mut self, value: V::Native) -> Result { - if let Some(&key) = self.map.get(value.to_byte_slice()) { - // Append existing value. - self.keys_builder.append_value(key)?; - Ok(key) - } else { - // Append new value. - let key = K::Native::from_usize(self.values_builder.len()) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - self.values_builder.append_value(value)?; - self.keys_builder.append_value(key as K::Native)?; - self.map.insert(value.to_byte_slice().into(), key); - Ok(key) - } - } - - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.keys_builder.append_null() - } - - /// Builds the `DictionaryArray` and reset this builder. - pub fn finish(&mut self) -> DictionaryArray { - self.map.clear(); - let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); - self.keys_builder.finish_dict(value_ref) - } -} - -/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. 
-/// -/// ``` -/// use arrow::{ -/// array::{ -/// Int8Array, StringArray, -/// PrimitiveBuilder, StringBuilder, StringDictionaryBuilder, -/// }, -/// datatypes::Int8Type, -/// }; -/// -/// // Create a dictionary array indexed by bytes whose values are Strings. -/// // It can thus hold up to 256 distinct string values. -/// -/// let key_builder = PrimitiveBuilder::::new(100); -/// let value_builder = StringBuilder::new(100); -/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); -/// -/// // The builder builds the dictionary value by value -/// builder.append("abc").unwrap(); -/// builder.append_null().unwrap(); -/// builder.append("def").unwrap(); -/// builder.append("def").unwrap(); -/// builder.append("abc").unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. -/// let av = array.values(); -/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); -/// -/// assert_eq!(ava.value(0), "abc"); -/// assert_eq!(ava.value(1), "def"); -/// -/// ``` -#[derive(Debug)] -pub struct StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - keys_builder: PrimitiveBuilder, - values_builder: StringBuilder, - map: HashMap, K::Native>, -} - -impl StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - /// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder. - pub fn new(keys_builder: PrimitiveBuilder, values_builder: StringBuilder) -> Self { - Self { - keys_builder, - values_builder, - map: HashMap::new(), - } - } - - /// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary - /// which is initialized with the given values. - /// The indices of those dictionary values are used as keys. 
- /// - /// # Example - /// - /// ``` - /// use arrow::datatypes::Int16Type; - /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array}; - /// use std::convert::TryFrom; - /// - /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); - /// - /// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::::new(3), &dictionary_values).unwrap(); - /// builder.append("def").unwrap(); - /// builder.append_null().unwrap(); - /// builder.append("abc").unwrap(); - /// - /// let dictionary_array = builder.finish(); - /// - /// let keys = dictionary_array.keys(); - /// - /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)])); - /// ``` - pub fn new_with_dictionary( - keys_builder: PrimitiveBuilder, - dictionary_values: &StringArray, - ) -> Result { - let dict_len = dictionary_values.len(); - let mut values_builder = - StringBuilder::with_capacity(dict_len, dictionary_values.value_data().len()); - let mut map: HashMap, K::Native> = HashMap::with_capacity(dict_len); - for i in 0..dict_len { - if dictionary_values.is_valid(i) { - let value = dictionary_values.value(i); - map.insert( - value.as_bytes().into(), - K::Native::from_usize(i) - .ok_or(ArrowError::DictionaryKeyOverflowError)?, - ); - values_builder.append_value(value)?; - } else { - values_builder.append_null()?; - } - } - Ok(Self { - keys_builder, - values_builder, - map, - }) - } -} - -impl ArrayBuilder for StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - /// Returns the builder as an non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as an mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.keys_builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.keys_builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - /// Append a primitive value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - pub fn append(&mut self, value: impl AsRef) -> Result { - if let Some(&key) = self.map.get(value.as_ref().as_bytes()) { - // Append existing value. - self.keys_builder.append_value(key)?; - Ok(key) - } else { - // Append new value. - let key = K::Native::from_usize(self.values_builder.len()) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - self.values_builder.append_value(value.as_ref())?; - self.keys_builder.append_value(key as K::Native)?; - self.map.insert(value.as_ref().as_bytes().into(), key); - Ok(key) - } - } - - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.keys_builder.append_null() - } - - /// Builds the `DictionaryArray` and reset this builder. 
- pub fn finish(&mut self) -> DictionaryArray { - self.map.clear(); - let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); - self.keys_builder.finish_dict(value_ref) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::array::Array; - use crate::bitmap::Bitmap; - - #[test] - fn test_builder_i32_empty() { - let mut b = Int32BufferBuilder::new(5); - assert_eq!(0, b.len()); - assert_eq!(16, b.capacity()); - let a = b.finish(); - assert_eq!(0, a.len()); - } - - #[test] - fn test_builder_i32_alloc_zero_bytes() { - let mut b = Int32BufferBuilder::new(0); - b.append(123); - let a = b.finish(); - assert_eq!(4, a.len()); - } - - #[test] - fn test_builder_i32() { - let mut b = Int32BufferBuilder::new(5); - for i in 0..5 { - b.append(i); - } - assert_eq!(16, b.capacity()); - let a = b.finish(); - assert_eq!(20, a.len()); - } - - #[test] - fn test_builder_i32_grow_buffer() { - let mut b = Int32BufferBuilder::new(2); - assert_eq!(16, b.capacity()); - for i in 0..20 { - b.append(i); - } - assert_eq!(32, b.capacity()); - let a = b.finish(); - assert_eq!(80, a.len()); - } - - #[test] - fn test_builder_finish() { - let mut b = Int32BufferBuilder::new(5); - assert_eq!(16, b.capacity()); - for i in 0..10 { - b.append(i); - } - let mut a = b.finish(); - assert_eq!(40, a.len()); - assert_eq!(0, b.len()); - assert_eq!(0, b.capacity()); - - // Try build another buffer after cleaning up. 
- for i in 0..20 { - b.append(i) - } - assert_eq!(32, b.capacity()); - a = b.finish(); - assert_eq!(80, a.len()); - } - - #[test] - fn test_reserve() { - let mut b = UInt8BufferBuilder::new(2); - assert_eq!(64, b.capacity()); - b.reserve(64); - assert_eq!(64, b.capacity()); - b.reserve(65); - assert_eq!(128, b.capacity()); - - let mut b = Int32BufferBuilder::new(2); - assert_eq!(16, b.capacity()); - b.reserve(16); - assert_eq!(16, b.capacity()); - b.reserve(17); - assert_eq!(32, b.capacity()); - } - - #[test] - fn test_append_slice() { - let mut b = UInt8BufferBuilder::new(0); - b.append_slice(b"Hello, "); - b.append_slice(b"World!"); - let buffer = b.finish(); - assert_eq!(13, buffer.len()); - - let mut b = Int32BufferBuilder::new(0); - b.append_slice(&[32, 54]); - let buffer = b.finish(); - assert_eq!(8, buffer.len()); - } - - #[test] - fn test_append_values() -> Result<()> { - let mut a = Int8Builder::new(0); - a.append_value(1)?; - a.append_null()?; - a.append_value(-2)?; - assert_eq!(a.len(), 3); - - // append values - let values = &[1, 2, 3, 4]; - let is_valid = &[true, true, false, true]; - a.append_values(values, is_valid)?; - - assert_eq!(a.len(), 7); - let array = a.finish(); - assert_eq!(array.value(0), 1); - assert!(array.is_null(1)); - assert_eq!(array.value(2), -2); - assert_eq!(array.value(3), 1); - assert_eq!(array.value(4), 2); - assert!(array.is_null(5)); - assert_eq!(array.value(6), 4); - - Ok(()) - } - - #[test] - fn test_boolean_buffer_builder_write_bytes() { - let mut b = BooleanBufferBuilder::new(4); - b.append(false); - b.append(true); - b.append(false); - b.append(true); - assert_eq!(4, b.len()); - assert_eq!(512, b.capacity()); - let buffer = b.finish(); - assert_eq!(1, buffer.len()); - - // Overallocate capacity - let mut b = BooleanBufferBuilder::new(8); - b.append_slice(&[false, true, false, true]); - assert_eq!(4, b.len()); - assert_eq!(512, b.capacity()); - let buffer = b.finish(); - assert_eq!(1, buffer.len()); - } - - #[test] - fn 
test_boolean_buffer_builder_unset_first_bit() { - let mut buffer = BooleanBufferBuilder::new(4); - buffer.append(true); - buffer.append(true); - buffer.append(false); - buffer.append(true); - buffer.set_bit(0, false); - assert_eq!(buffer.len(), 4); - assert_eq!(buffer.finish().as_slice(), &[0b1010_u8]); - } - - #[test] - fn test_boolean_buffer_builder_unset_last_bit() { - let mut buffer = BooleanBufferBuilder::new(4); - buffer.append(true); - buffer.append(true); - buffer.append(false); - buffer.append(true); - buffer.set_bit(3, false); - assert_eq!(buffer.len(), 4); - assert_eq!(buffer.finish().as_slice(), &[0b0011_u8]); - } - - #[test] - fn test_boolean_buffer_builder_unset_an_inner_bit() { - let mut buffer = BooleanBufferBuilder::new(5); - buffer.append(true); - buffer.append(true); - buffer.append(false); - buffer.append(true); - buffer.set_bit(1, false); - assert_eq!(buffer.len(), 4); - assert_eq!(buffer.finish().as_slice(), &[0b1001_u8]); - } - - #[test] - fn test_boolean_buffer_builder_unset_several_bits() { - let mut buffer = BooleanBufferBuilder::new(5); - buffer.append(true); - buffer.append(true); - buffer.append(true); - buffer.append(false); - buffer.append(true); - buffer.set_bit(1, false); - buffer.set_bit(2, false); - assert_eq!(buffer.len(), 5); - assert_eq!(buffer.finish().as_slice(), &[0b10001_u8]); - } - - #[test] - fn test_boolean_buffer_builder_unset_several_bits_bigger_than_one_byte() { - let mut buffer = BooleanBufferBuilder::new(16); - buffer.append_n(10, true); - buffer.set_bit(0, false); - buffer.set_bit(3, false); - buffer.set_bit(9, false); - assert_eq!(buffer.len(), 10); - assert_eq!(buffer.finish().as_slice(), &[0b11110110_u8, 0b01_u8]); - } - - #[test] - fn test_boolean_buffer_builder_flip_several_bits_bigger_than_one_byte() { - let mut buffer = BooleanBufferBuilder::new(16); - buffer.append_n(5, true); - buffer.append_n(5, false); - buffer.append_n(5, true); - buffer.set_bit(0, false); - buffer.set_bit(3, false); - buffer.set_bit(9, 
false); - buffer.set_bit(6, true); - buffer.set_bit(14, true); - buffer.set_bit(13, false); - assert_eq!(buffer.len(), 15); - assert_eq!(buffer.finish().as_slice(), &[0b01010110_u8, 0b1011100_u8]); - } - - #[test] - fn test_bool_buffer_builder_get_first_bit() { - let mut buffer = BooleanBufferBuilder::new(16); - buffer.append_n(8, true); - buffer.append_n(8, false); - assert!(buffer.get_bit(0)); - } - - #[test] - fn test_bool_buffer_builder_get_first_bit_not_requires_mutability() { - let buffer = { - let mut buffer = BooleanBufferBuilder::new(16); - buffer.append_n(8, true); - buffer - }; - - assert!(buffer.get_bit(0)); - } - - #[test] - fn test_bool_buffer_builder_get_last_bit() { - let mut buffer = BooleanBufferBuilder::new(16); - buffer.append_n(8, true); - buffer.append_n(8, false); - assert!(!buffer.get_bit(15)); - } - - #[test] - fn test_bool_buffer_builder_get_an_inner_bit() { - let mut buffer = BooleanBufferBuilder::new(16); - buffer.append_n(4, false); - buffer.append_n(8, true); - buffer.append_n(4, false); - assert!(buffer.get_bit(11)); - } - - #[test] - fn test_bool_buffer_fuzz() { - use rand::prelude::*; - - let mut buffer = BooleanBufferBuilder::new(12); - let mut all_bools = vec![]; - let mut rng = rand::thread_rng(); - - let src_len = 32; - let (src, compacted_src) = { - let src: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0)) - .take(src_len) - .collect(); - - let mut compacted_src = BooleanBufferBuilder::new(src_len); - compacted_src.append_slice(&src); - (src, compacted_src.finish()) - }; - - for _ in 0..100 { - let a = rng.next_u32() as usize % src_len; - let b = rng.next_u32() as usize % src_len; - - let start = a.min(b); - let end = a.max(b); - - buffer.append_packed_range(start..end, compacted_src.as_slice()); - all_bools.extend_from_slice(&src[start..end]); - } - - let mut compacted = BooleanBufferBuilder::new(all_bools.len()); - compacted.append_slice(&all_bools); - - assert_eq!(buffer.finish(), compacted.finish()) - } - - 
#[test] - fn test_boolean_array_builder_append_slice() { - let arr1 = - BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); - - let mut builder = BooleanArray::builder(0); - builder.append_slice(&[true, false]).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(false).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1, arr2); - } - - #[test] - fn test_boolean_array_builder_append_slice_large() { - let arr1 = BooleanArray::from(vec![true; 513]); - - let mut builder = BooleanArray::builder(512); - builder.append_slice(&[true; 513]).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1, arr2); - } - - #[test] - fn test_boolean_array_builder_resize() { - let mut builder = BooleanBufferBuilder::new(20); - builder.append_n(4, true); - builder.append_n(7, false); - builder.append_n(2, true); - builder.resize(20); - - assert_eq!(builder.len, 20); - assert_eq!( - builder.buffer.as_slice(), - &[0b00001111, 0b00011000, 0b00000000] - ); - - builder.resize(5); - assert_eq!(builder.len, 5); - assert_eq!(builder.buffer.as_slice(), &[0b00001111]); - - builder.append_n(4, true); - assert_eq!(builder.len, 9); - assert_eq!(builder.buffer.as_slice(), &[0b11101111, 0b00000001]); - } - - #[test] - fn test_boolean_builder_increases_buffer_len() { - // 00000010 01001000 - let buf = Buffer::from([72_u8, 2_u8]); - let mut builder = BooleanBufferBuilder::new(8); - - for i in 0..16 { - if i == 3 || i == 6 || i == 9 { - builder.append(true); - } else { - builder.append(false); - } - } - let buf2 = builder.finish(); - - assert_eq!(buf.len(), buf2.len()); - assert_eq!(buf.as_slice(), buf2.as_slice()); - } - - #[test] - fn test_primitive_array_builder_i32() { - let mut builder = Int32Array::builder(5); - for i in 0..5 { - builder.append_value(i).unwrap(); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - 
assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_i32_append_iter() { - let mut builder = Int32Array::builder(5); - unsafe { builder.append_trusted_len_iter(0..5) }.unwrap(); - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_i32_append_nulls() { - let mut builder = Int32Array::builder(5); - builder.append_nulls(5).unwrap(); - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(5, arr.null_count()); - for i in 0..5 { - assert!(arr.is_null(i)); - assert!(!arr.is_valid(i)); - } - } - - #[test] - fn test_primitive_array_builder_date32() { - let mut builder = Date32Array::builder(5); - for i in 0..5 { - builder.append_value(i).unwrap(); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_timestamp_second() { - let mut builder = TimestampSecondArray::builder(5); - for i in 0..5 { - builder.append_value(i).unwrap(); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i64, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_bool() { - // 00000010 01001000 - let buf = Buffer::from([72_u8, 2_u8]); - let mut builder = BooleanArray::builder(10); - for i in 0..10 { - if i == 3 || i == 6 || i == 9 { - builder.append_value(true).unwrap(); - } else { - 
builder.append_value(false).unwrap(); - } - } - - let arr = builder.finish(); - assert_eq!(&buf, arr.values()); - assert_eq!(10, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..10 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i) - } - } - - #[test] - fn test_primitive_array_builder_append_option() { - let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_option(Some(0)).unwrap(); - builder.append_option(None).unwrap(); - builder.append_option(Some(2)).unwrap(); - builder.append_option(None).unwrap(); - builder.append_option(Some(4)).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_append_null() { - let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_value(0).unwrap(); - builder.append_value(2).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(4).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_append_slice() { - let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); - - let mut 
builder = Int32Array::builder(5); - builder.append_slice(&[0, 2]).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(4).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_finish() { - let mut builder = Int32Builder::new(5); - builder.append_slice(&[2, 4, 6, 8]).unwrap(); - let mut arr = builder.finish(); - assert_eq!(4, arr.len()); - assert_eq!(0, builder.len()); - - builder.append_slice(&[1, 3, 5, 7, 9]).unwrap(); - arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_list_array_builder() { - let values_builder = Int32Builder::new(10); - let mut builder = ListBuilder::new(values_builder); - - // [[0, 1, 2], [3, 4, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_value(4).unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - let values = list_array.values().data().buffers()[0].clone(); - assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); - assert_eq!( - Buffer::from_slice_ref(&[0, 3, 6, 8]), - list_array.data().buffers()[0].clone() - ); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, 
list_array.value_offsets()[2]); - assert_eq!(2, list_array.value_length(2)); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - } - - #[test] - fn test_large_list_array_builder() { - let values_builder = Int32Builder::new(10); - let mut builder = LargeListBuilder::new(values_builder); - - // [[0, 1, 2], [3, 4, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_value(4).unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - let values = list_array.values().data().buffers()[0].clone(); - assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); - assert_eq!( - Buffer::from_slice_ref(&[0i64, 3, 6, 8]), - list_array.data().buffers()[0].clone() - ); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offsets()[2]); - assert_eq!(2, list_array.value_length(2)); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - } - - #[test] - fn test_list_array_builder_nulls() { - let values_builder = Int32Builder::new(10); - let mut builder = ListBuilder::new(values_builder); - - // [[0, 1, 2], null, [3, null, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - 
builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_offsets()[2]); - assert_eq!(3, list_array.value_length(2)); - } - - #[test] - fn test_large_list_array_builder_nulls() { - let values_builder = Int32Builder::new(10); - let mut builder = LargeListBuilder::new(values_builder); - - // [[0, 1, 2], null, [3, null, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_offsets()[2]); - assert_eq!(3, list_array.value_length(2)); - } - - #[test] - fn test_fixed_size_list_array_builder() { - let values_builder = Int32Builder::new(10); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_null().unwrap(); - builder.append(false).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_null().unwrap(); 
- builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.values().append_null().unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - fn test_list_array_builder_finish() { - let values_builder = Int32Array::builder(5); - let mut builder = ListBuilder::new(values_builder); - - builder.values().append_slice(&[1, 2, 3]).unwrap(); - builder.append(true).unwrap(); - builder.values().append_slice(&[4, 5, 6]).unwrap(); - builder.append(true).unwrap(); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); - - builder.values().append_slice(&[7, 8, 9]).unwrap(); - builder.append(true).unwrap(); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_empty() { - let values_builder = Int32Array::builder(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - let arr = builder.finish(); - assert_eq!(0, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_finish() { - let values_builder = Int32Array::builder(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - builder.values().append_slice(&[1, 2, 3]).unwrap(); - builder.append(true).unwrap(); - builder.values().append_slice(&[4, 5, 6]).unwrap(); - builder.append(true).unwrap(); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); - - builder.values().append_slice(&[7, 8, 9]).unwrap(); - builder.append(true).unwrap(); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, 
builder.len()); - } - - #[test] - fn test_list_list_array_builder() { - let primitive_builder = Int32Builder::new(10); - let values_builder = ListBuilder::new(primitive_builder); - let mut builder = ListBuilder::new(values_builder); - - // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder.values().values().append_value(1).unwrap(); - builder.values().values().append_value(2).unwrap(); - builder.values().append(true).unwrap(); - builder.values().values().append_value(3).unwrap(); - builder.values().values().append_value(4).unwrap(); - builder.values().append(true).unwrap(); - builder.append(true).unwrap(); - - builder.values().values().append_value(5).unwrap(); - builder.values().values().append_value(6).unwrap(); - builder.values().values().append_value(7).unwrap(); - builder.values().append(true).unwrap(); - builder.values().append(false).unwrap(); - builder.values().values().append_value(8).unwrap(); - builder.values().append(true).unwrap(); - builder.append(true).unwrap(); - - builder.append(false).unwrap(); - - builder.values().values().append_value(9).unwrap(); - builder.values().values().append_value(10).unwrap(); - builder.values().append(true).unwrap(); - builder.append(true).unwrap(); - - let list_array = builder.finish(); - - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!( - Buffer::from_slice_ref(&[0, 2, 5, 5, 6]), - list_array.data().buffers()[0].clone() - ); - - assert_eq!(6, list_array.values().data().len()); - assert_eq!(1, list_array.values().data().null_count()); - assert_eq!( - Buffer::from_slice_ref(&[0, 2, 4, 7, 7, 8, 10]), - list_array.values().data().buffers()[0].clone() - ); - - assert_eq!(10, list_array.values().data().child_data()[0].len()); - assert_eq!(0, list_array.values().data().child_data()[0].null_count()); - assert_eq!( - Buffer::from_slice_ref(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - list_array.values().data().child_data()[0].buffers()[0].clone() - ); - } - - #[test] - fn 
test_binary_array_builder() { - let mut builder = BinaryBuilder::new(20); - - builder.append_byte(b'h').unwrap(); - builder.append_byte(b'e').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_byte(b'w').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append_byte(b'r').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'd').unwrap(); - builder.append(true).unwrap(); - - let binary_array = builder.finish(); - - assert_eq!(3, binary_array.len()); - assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); - assert_eq!([] as [u8; 0], binary_array.value(1)); - assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); - assert_eq!(5, binary_array.value_offsets()[2]); - assert_eq!(5, binary_array.value_length(2)); - } - - #[test] - fn test_large_binary_array_builder() { - let mut builder = LargeBinaryBuilder::new(20); - - builder.append_byte(b'h').unwrap(); - builder.append_byte(b'e').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_byte(b'w').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append_byte(b'r').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'd').unwrap(); - builder.append(true).unwrap(); - - let binary_array = builder.finish(); - - assert_eq!(3, binary_array.len()); - assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); - assert_eq!([] as [u8; 0], binary_array.value(1)); - assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); - assert_eq!(5, binary_array.value_offsets()[2]); - assert_eq!(5, binary_array.value_length(2)); - } - - #[test] - fn 
test_string_array_builder() { - let mut builder = StringBuilder::new(20); - - builder.append_value("hello").unwrap(); - builder.append(true).unwrap(); - builder.append_value("world").unwrap(); - - let string_array = builder.finish(); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("", string_array.value(1)); - assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(5, string_array.value_length(2)); - } - - #[test] - fn test_fixed_size_binary_builder() { - let mut builder = FixedSizeBinaryBuilder::new(15, 5); - - // [b"hello", null, "arrow"] - builder.append_value(b"hello").unwrap(); - builder.append_null().unwrap(); - builder.append_value(b"arrow").unwrap(); - let fixed_size_binary_array: FixedSizeBinaryArray = builder.finish(); - - assert_eq!( - &DataType::FixedSizeBinary(5), - fixed_size_binary_array.data_type() - ); - assert_eq!(3, fixed_size_binary_array.len()); - assert_eq!(1, fixed_size_binary_array.null_count()); - assert_eq!(10, fixed_size_binary_array.value_offset(2)); - assert_eq!(5, fixed_size_binary_array.value_length()); - } - - #[test] - fn test_decimal_builder() { - let mut builder = DecimalBuilder::new(30, 23, 6); - - builder.append_value(8_887_000_000).unwrap(); - builder.append_null().unwrap(); - builder.append_value(-8_887_000_000).unwrap(); - let decimal_array: DecimalArray = builder.finish(); - - assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); - assert_eq!(3, decimal_array.len()); - assert_eq!(1, decimal_array.null_count()); - assert_eq!(32, decimal_array.value_offset(2)); - assert_eq!(16, decimal_array.value_length()); - } - - #[test] - fn test_string_array_builder_finish() { - let mut builder = StringBuilder::new(10); - - builder.append_value("hello").unwrap(); - builder.append_value("world").unwrap(); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, 
builder.len()); - - builder.append_value("arrow").unwrap(); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_string_array_builder_append_string() { - let mut builder = StringBuilder::new(20); - - let var = "hello".to_owned(); - builder.append_value(&var).unwrap(); - builder.append(true).unwrap(); - builder.append_value("world").unwrap(); - - let string_array = builder.finish(); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("", string_array.value(1)); - assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(5, string_array.value_length(2)); - } - - #[test] - fn test_string_array_builder_append_option() { - let mut builder = StringBuilder::new(20); - builder.append_option(Some("hello")).unwrap(); - builder.append_option(None::<&str>).unwrap(); - builder.append_option(None::).unwrap(); - builder.append_option(Some("world")).unwrap(); - - let string_array = builder.finish(); - - assert_eq!(4, string_array.len()); - assert_eq!("hello", string_array.value(0)); - assert!(string_array.is_null(1)); - assert!(string_array.is_null(2)); - assert_eq!("world", string_array.value(3)); - } - - #[test] - fn test_struct_array_builder() { - let string_builder = StringBuilder::new(4); - let int_builder = Int32Builder::new(4); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Utf8, false)); - field_builders.push(Box::new(string_builder) as Box); - fields.push(Field::new("f2", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - assert_eq!(2, builder.num_fields()); - - let string_builder = builder - .field_builder::(0) - .expect("builder at field 0 should be string builder"); - string_builder.append_value("joe").unwrap(); - 
string_builder.append_null().unwrap(); - string_builder.append_null().unwrap(); - string_builder.append_value("mark").unwrap(); - - let int_builder = builder - .field_builder::(1) - .expect("builder at field 1 should be int builder"); - int_builder.append_value(1).unwrap(); - int_builder.append_value(2).unwrap(); - int_builder.append_null().unwrap(); - int_builder.append_value(4).unwrap(); - - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_null().unwrap(); - builder.append(true).unwrap(); - - let arr = builder.finish(); - - let struct_data = arr.data(); - assert_eq!(4, struct_data.len()); - assert_eq!(1, struct_data.null_count()); - assert_eq!( - Some(&Bitmap::from(Buffer::from(&[11_u8]))), - struct_data.null_bitmap() - ); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .null_bit_buffer(Some(Buffer::from(&[9_u8]))) - .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) - .add_buffer(Buffer::from_slice_ref(b"joemark")) - .build() - .unwrap(); - - let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[11_u8]))) - .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) - .build() - .unwrap(); - - assert_eq!(expected_string_data, *arr.column(0).data()); - assert_eq!(expected_int_data, *arr.column(1).data()); - } - - #[test] - fn test_struct_array_builder_finish() { - let int_builder = Int32Builder::new(10); - let bool_builder = BooleanBuilder::new(10); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - fields.push(Field::new("f2", DataType::Boolean, false)); - field_builders.push(Box::new(bool_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - builder - .field_builder::(0) - .unwrap() - .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - .unwrap(); - builder - .field_builder::(1) - 
.unwrap() - .append_slice(&[ - false, true, false, true, false, true, false, true, false, true, - ]) - .unwrap(); - - // Append slot values - all are valid. - for _ in 0..10 { - assert!(builder.append(true).is_ok()) - } - - assert_eq!(10, builder.len()); - - let arr = builder.finish(); - - assert_eq!(10, arr.len()); - assert_eq!(0, builder.len()); - - builder - .field_builder::(0) - .unwrap() - .append_slice(&[1, 3, 5, 7, 9]) - .unwrap(); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[false, true, false, true, false]) - .unwrap(); - - // Append slot values - all are valid. - for _ in 0..5 { - assert!(builder.append(true).is_ok()) - } - - assert_eq!(5, builder.len()); - - let arr = builder.finish(); - - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_map_array_builder() { - let string_builder = StringBuilder::new(4); - let int_builder = Int32Builder::new(4); - - let mut builder = MapBuilder::new(None, string_builder, int_builder); - - let string_builder = builder.keys(); - string_builder.append_value("joe").unwrap(); - string_builder.append_null().unwrap(); - string_builder.append_null().unwrap(); - string_builder.append_value("mark").unwrap(); - - let int_builder = builder.values(); - int_builder.append_value(1).unwrap(); - int_builder.append_value(2).unwrap(); - int_builder.append_null().unwrap(); - int_builder.append_value(4).unwrap(); - - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.append(true).unwrap(); - - let arr = builder.finish(); - - let map_data = arr.data(); - assert_eq!(3, map_data.len()); - assert_eq!(1, map_data.null_count()); - assert_eq!( - Some(&Bitmap::from(Buffer::from(&[5_u8]))), - map_data.null_bitmap() - ); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .null_bit_buffer(Some(Buffer::from(&[9_u8]))) - .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) - .add_buffer(Buffer::from_slice_ref(b"joemark")) - .build() - .unwrap(); - - 
let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[11_u8]))) - .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) - .build() - .unwrap(); - - assert_eq!(&expected_string_data, arr.keys().data()); - assert_eq!(&expected_int_data, arr.values().data()); - } - - // TODO: add a test that finishes building, after designing a spec-compliant - // way of inserting values to the map. - // A map's values shouldn't be repeated within a slot - - #[test] - fn test_struct_array_builder_from_schema() { - let mut fields = vec![ - Field::new("f1", DataType::Float32, false), - Field::new("f2", DataType::Utf8, false), - ]; - let sub_fields = vec![ - Field::new("g1", DataType::Int32, false), - Field::new("g2", DataType::Boolean, false), - ]; - let struct_type = DataType::Struct(sub_fields); - fields.push(Field::new("f3", struct_type, false)); - - let mut builder = StructBuilder::from_fields(fields, 5); - assert_eq!(3, builder.num_fields()); - assert!(builder.field_builder::(0).is_some()); - assert!(builder.field_builder::(1).is_some()); - assert!(builder.field_builder::(2).is_some()); - } - - #[test] - #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }) is not currently supported" - )] - fn test_struct_array_builder_from_schema_unsupported_type() { - let mut fields = vec![Field::new("f1", DataType::Int16, false)]; - let list_type = - DataType::List(Box::new(Field::new("item", DataType::Int64, true))); - fields.push(Field::new("f2", list_type, false)); - - let _ = StructBuilder::from_fields(fields, 5); - } - - #[test] - fn test_struct_array_builder_field_builder_type_mismatch() { - let int_builder = Int32Builder::new(10); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - - let mut 
builder = StructBuilder::new(fields, field_builders); - assert!(builder.field_builder::(0).is_none()); - } - - #[test] - fn test_primitive_dictionary_builder() { - let key_builder = PrimitiveBuilder::::new(3); - let value_builder = PrimitiveBuilder::::new(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); - builder.append(12345678).unwrap(); - builder.append_null().unwrap(); - builder.append(22345678).unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &UInt8Array::from(vec![Some(0), None, Some(1)]) - ); - - // Values are polymorphic and so require a downcast. - let av = array.values(); - let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); - let avs: &[u32] = ava.values(); - - assert!(!array.is_null(0)); - assert!(array.is_null(1)); - assert!(!array.is_null(2)); - - assert_eq!(avs, &[12345678, 22345678]); - } - - #[test] - fn test_string_dictionary_builder() { - let key_builder = PrimitiveBuilder::::new(5); - let value_builder = StringBuilder::new(2); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); - builder.append("abc").unwrap(); - builder.append_null().unwrap(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) - ); - - // Values are polymorphic and so require a downcast. 
- let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); - - assert_eq!(ava.value(0), "abc"); - assert_eq!(ava.value(1), "def"); - } - - #[test] - fn test_string_dictionary_builder_with_existing_dictionary() { - let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]); - - let key_builder = PrimitiveBuilder::::new(6); - let mut builder = - StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) - .unwrap(); - builder.append("abc").unwrap(); - builder.append_null().unwrap(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - builder.append("ghi").unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]) - ); - - // Values are polymorphic and so require a downcast. - let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); - - assert!(!ava.is_valid(0)); - assert_eq!(ava.value(1), "def"); - assert_eq!(ava.value(2), "abc"); - assert_eq!(ava.value(3), "ghi"); - } - - #[test] - fn test_string_dictionary_builder_with_reserved_null_value() { - let dictionary: Vec> = vec![None]; - let dictionary = StringArray::from(dictionary); - - let key_builder = PrimitiveBuilder::::new(4); - let mut builder = - StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) - .unwrap(); - builder.append("abc").unwrap(); - builder.append_null().unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - let array = builder.finish(); - - assert!(array.is_null(1)); - assert!(!array.is_valid(1)); - - let keys = array.keys(); - - assert_eq!(keys.value(0), 1); - assert!(keys.is_null(1)); - // zero initialization is currently guaranteed by Buffer allocation and resizing - assert_eq!(keys.value(1), 0); - assert_eq!(keys.value(2), 2); - assert_eq!(keys.value(3), 1); - } - - #[test] - #[should_panic(expected = 
"DictionaryKeyOverflowError")] - fn test_primitive_dictionary_overflow() { - let key_builder = PrimitiveBuilder::::new(257); - let value_builder = PrimitiveBuilder::::new(257); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); - // 256 unique keys. - for i in 0..256 { - builder.append(i + 1000).unwrap(); - } - // Special error if the key overflows (256th entry) - builder.append(1257).unwrap(); - } -} diff --git a/arrow/src/array/builder/boolean_buffer_builder.rs b/arrow/src/array/builder/boolean_buffer_builder.rs new file mode 100644 index 000000000000..5b6d1ce48478 --- /dev/null +++ b/arrow/src/array/builder/boolean_buffer_builder.rs @@ -0,0 +1,383 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::buffer::{Buffer, MutableBuffer}; + +use super::Range; + +use crate::util::bit_util; + +#[derive(Debug)] +pub struct BooleanBufferBuilder { + buffer: MutableBuffer, + len: usize, +} + +impl BooleanBufferBuilder { + #[inline] + pub fn new(capacity: usize) -> Self { + let byte_capacity = bit_util::ceil(capacity, 8); + let buffer = MutableBuffer::new(byte_capacity); + Self { buffer, len: 0 } + } + + #[inline] + pub fn len(&self) -> usize { + self.len + } + + #[inline] + pub fn set_bit(&mut self, index: usize, v: bool) { + if v { + bit_util::set_bit(self.buffer.as_mut(), index); + } else { + bit_util::unset_bit(self.buffer.as_mut(), index); + } + } + + #[inline] + pub fn get_bit(&self, index: usize) -> bool { + bit_util::get_bit(self.buffer.as_slice(), index) + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + #[inline] + pub fn capacity(&self) -> usize { + self.buffer.capacity() * 8 + } + + #[inline] + pub fn advance(&mut self, additional: usize) { + let new_len = self.len + additional; + let new_len_bytes = bit_util::ceil(new_len, 8); + if new_len_bytes > self.buffer.len() { + self.buffer.resize(new_len_bytes, 0); + } + self.len = new_len; + } + + /// Reserve space to at least `additional` new bits. + /// Capacity will be `>= self.len() + additional`. + /// New bytes are uninitialized and reading them is undefined behavior. + #[inline] + pub fn reserve(&mut self, additional: usize) { + let capacity = self.len + additional; + if capacity > self.capacity() { + // convert differential to bytes + let additional = bit_util::ceil(capacity, 8) - self.buffer.len(); + self.buffer.reserve(additional); + } + } + + /// Resizes the buffer, either truncating its contents (with no change in capacity), or + /// growing it (potentially reallocating it) and writing `false` in the newly available bits. 
+ #[inline] + pub fn resize(&mut self, len: usize) { + let len_bytes = bit_util::ceil(len, 8); + self.buffer.resize(len_bytes, 0); + self.len = len; + } + + #[inline] + pub fn append(&mut self, v: bool) { + self.advance(1); + if v { + unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), self.len - 1) }; + } + } + + #[inline] + pub fn append_n(&mut self, additional: usize, v: bool) { + self.advance(additional); + if additional > 0 && v { + let offset = self.len() - additional; + (0..additional).for_each(|i| unsafe { + bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) + }) + } + } + + #[inline] + pub fn append_slice(&mut self, slice: &[bool]) { + let additional = slice.len(); + self.advance(additional); + + let offset = self.len() - additional; + for (i, v) in slice.iter().enumerate() { + if *v { + unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) } + } + } + } + + /// Append `range` bits from `to_set` + /// + /// `to_set` is a slice of bits packed LSB-first into `[u8]` + /// + /// # Panics + /// + /// Panics if `to_set` does not contain `ceil(range.end / 8)` bytes + pub fn append_packed_range(&mut self, range: Range, to_set: &[u8]) { + let offset_write = self.len; + let len = range.end - range.start; + self.advance(len); + crate::util::bit_mask::set_bits( + self.buffer.as_slice_mut(), + to_set, + offset_write, + range.start, + len, + ); + } + + /// Returns the packed bits + pub fn as_slice(&self) -> &[u8] { + self.buffer.as_slice() + } + + #[inline] + pub fn finish(&mut self) -> Buffer { + let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.into() + } +} + +impl From for Buffer { + #[inline] + fn from(builder: BooleanBufferBuilder) -> Self { + builder.buffer.into() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_boolean_buffer_builder_write_bytes() { + let mut b = BooleanBufferBuilder::new(4); + b.append(false); + b.append(true); + b.append(false); + b.append(true); + 
assert_eq!(4, b.len()); + assert_eq!(512, b.capacity()); + let buffer = b.finish(); + assert_eq!(1, buffer.len()); + + // Overallocate capacity + let mut b = BooleanBufferBuilder::new(8); + b.append_slice(&[false, true, false, true]); + assert_eq!(4, b.len()); + assert_eq!(512, b.capacity()); + let buffer = b.finish(); + assert_eq!(1, buffer.len()); + } + + #[test] + fn test_boolean_buffer_builder_unset_first_bit() { + let mut buffer = BooleanBufferBuilder::new(4); + buffer.append(true); + buffer.append(true); + buffer.append(false); + buffer.append(true); + buffer.set_bit(0, false); + assert_eq!(buffer.len(), 4); + assert_eq!(buffer.finish().as_slice(), &[0b1010_u8]); + } + + #[test] + fn test_boolean_buffer_builder_unset_last_bit() { + let mut buffer = BooleanBufferBuilder::new(4); + buffer.append(true); + buffer.append(true); + buffer.append(false); + buffer.append(true); + buffer.set_bit(3, false); + assert_eq!(buffer.len(), 4); + assert_eq!(buffer.finish().as_slice(), &[0b0011_u8]); + } + + #[test] + fn test_boolean_buffer_builder_unset_an_inner_bit() { + let mut buffer = BooleanBufferBuilder::new(5); + buffer.append(true); + buffer.append(true); + buffer.append(false); + buffer.append(true); + buffer.set_bit(1, false); + assert_eq!(buffer.len(), 4); + assert_eq!(buffer.finish().as_slice(), &[0b1001_u8]); + } + + #[test] + fn test_boolean_buffer_builder_unset_several_bits() { + let mut buffer = BooleanBufferBuilder::new(5); + buffer.append(true); + buffer.append(true); + buffer.append(true); + buffer.append(false); + buffer.append(true); + buffer.set_bit(1, false); + buffer.set_bit(2, false); + assert_eq!(buffer.len(), 5); + assert_eq!(buffer.finish().as_slice(), &[0b10001_u8]); + } + + #[test] + fn test_boolean_buffer_builder_unset_several_bits_bigger_than_one_byte() { + let mut buffer = BooleanBufferBuilder::new(16); + buffer.append_n(10, true); + buffer.set_bit(0, false); + buffer.set_bit(3, false); + buffer.set_bit(9, false); + assert_eq!(buffer.len(), 
10); + assert_eq!(buffer.finish().as_slice(), &[0b11110110_u8, 0b01_u8]); + } + + #[test] + fn test_boolean_buffer_builder_flip_several_bits_bigger_than_one_byte() { + let mut buffer = BooleanBufferBuilder::new(16); + buffer.append_n(5, true); + buffer.append_n(5, false); + buffer.append_n(5, true); + buffer.set_bit(0, false); + buffer.set_bit(3, false); + buffer.set_bit(9, false); + buffer.set_bit(6, true); + buffer.set_bit(14, true); + buffer.set_bit(13, false); + assert_eq!(buffer.len(), 15); + assert_eq!(buffer.finish().as_slice(), &[0b01010110_u8, 0b1011100_u8]); + } + + #[test] + fn test_bool_buffer_builder_get_first_bit() { + let mut buffer = BooleanBufferBuilder::new(16); + buffer.append_n(8, true); + buffer.append_n(8, false); + assert!(buffer.get_bit(0)); + } + + #[test] + fn test_bool_buffer_builder_get_first_bit_not_requires_mutability() { + let buffer = { + let mut buffer = BooleanBufferBuilder::new(16); + buffer.append_n(8, true); + buffer + }; + + assert!(buffer.get_bit(0)); + } + + #[test] + fn test_bool_buffer_builder_get_last_bit() { + let mut buffer = BooleanBufferBuilder::new(16); + buffer.append_n(8, true); + buffer.append_n(8, false); + assert!(!buffer.get_bit(15)); + } + + #[test] + fn test_bool_buffer_builder_get_an_inner_bit() { + let mut buffer = BooleanBufferBuilder::new(16); + buffer.append_n(4, false); + buffer.append_n(8, true); + buffer.append_n(4, false); + assert!(buffer.get_bit(11)); + } + + #[test] + fn test_bool_buffer_fuzz() { + use rand::prelude::*; + + let mut buffer = BooleanBufferBuilder::new(12); + let mut all_bools = vec![]; + let mut rng = rand::thread_rng(); + + let src_len = 32; + let (src, compacted_src) = { + let src: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0)) + .take(src_len) + .collect(); + + let mut compacted_src = BooleanBufferBuilder::new(src_len); + compacted_src.append_slice(&src); + (src, compacted_src.finish()) + }; + + for _ in 0..100 { + let a = rng.next_u32() as usize % src_len; + let b 
= rng.next_u32() as usize % src_len; + + let start = a.min(b); + let end = a.max(b); + + buffer.append_packed_range(start..end, compacted_src.as_slice()); + all_bools.extend_from_slice(&src[start..end]); + } + + let mut compacted = BooleanBufferBuilder::new(all_bools.len()); + compacted.append_slice(&all_bools); + + assert_eq!(buffer.finish(), compacted.finish()) + } + + #[test] + fn test_boolean_array_builder_resize() { + let mut builder = BooleanBufferBuilder::new(20); + builder.append_n(4, true); + builder.append_n(7, false); + builder.append_n(2, true); + builder.resize(20); + + assert_eq!(builder.len(), 20); + assert_eq!(builder.as_slice(), &[0b00001111, 0b00011000, 0b00000000]); + + builder.resize(5); + assert_eq!(builder.len(), 5); + assert_eq!(builder.as_slice(), &[0b00001111]); + + builder.append_n(4, true); + assert_eq!(builder.len(), 9); + assert_eq!(builder.as_slice(), &[0b11101111, 0b00000001]); + } + + #[test] + fn test_boolean_builder_increases_buffer_len() { + // 00000010 01001000 + let buf = Buffer::from([72_u8, 2_u8]); + let mut builder = BooleanBufferBuilder::new(8); + + for i in 0..16 { + if i == 3 || i == 6 || i == 9 { + builder.append(true); + } else { + builder.append(false); + } + } + let buf2 = builder.finish(); + + assert_eq!(buf.len(), buf2.len()); + assert_eq!(buf.as_slice(), buf2.as_slice()); + } +} diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs new file mode 100644 index 000000000000..98acb641b1a8 --- /dev/null +++ b/arrow/src/array/builder/boolean_builder.rs @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use crate::array::ArrayBuilder; +use crate::array::ArrayData; +use crate::array::ArrayRef; +use crate::array::BooleanArray; +use crate::datatypes::DataType; +use crate::error::{ArrowError, Result}; + +use super::BooleanBufferBuilder; + +/// Array builder for fixed-width primitive types +/// +/// # Example +/// +/// Create a `BooleanArray` from a `BooleanBuilder` +/// +/// ``` +/// use arrow::array::{Array, BooleanArray, BooleanBuilder}; +/// +/// let mut b = BooleanBuilder::new(4); +/// b.append_value(true); +/// b.append_null(); +/// b.append_value(false); +/// b.append_value(true); +/// let arr = b.finish(); +/// +/// assert_eq!(4, arr.len()); +/// assert_eq!(1, arr.null_count()); +/// assert_eq!(true, arr.value(0)); +/// assert!(arr.is_valid(0)); +/// assert!(!arr.is_null(0)); +/// assert!(!arr.is_valid(1)); +/// assert!(arr.is_null(1)); +/// assert_eq!(false, arr.value(2)); +/// assert!(arr.is_valid(2)); +/// assert!(!arr.is_null(2)); +/// assert_eq!(true, arr.value(3)); +/// assert!(arr.is_valid(3)); +/// assert!(!arr.is_null(3)); +/// ``` +#[derive(Debug)] +pub struct BooleanBuilder { + values_builder: BooleanBufferBuilder, + bitmap_builder: BooleanBufferBuilder, +} + +impl BooleanBuilder { + /// Creates a new primitive array builder + pub fn new(capacity: usize) -> Self { + Self { + values_builder: BooleanBufferBuilder::new(capacity), + bitmap_builder: BooleanBufferBuilder::new(capacity), + } + } + + /// Returns the capacity of this builder measured in slots of type `T` + pub fn capacity(&self) 
-> usize { + self.values_builder.capacity() + } + + /// Appends a value of type `T` into the builder + #[inline] + pub fn append_value(&mut self, v: bool) -> Result<()> { + self.bitmap_builder.append(true); + self.values_builder.append(v); + Ok(()) + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.bitmap_builder.append(false); + self.values_builder.advance(1); + Ok(()) + } + + /// Appends an `Option` into the builder + #[inline] + pub fn append_option(&mut self, v: Option) -> Result<()> { + match v { + None => self.append_null()?, + Some(v) => self.append_value(v)?, + }; + Ok(()) + } + + /// Appends a slice of type `T` into the builder + #[inline] + pub fn append_slice(&mut self, v: &[bool]) -> Result<()> { + self.bitmap_builder.append_n(v.len(), true); + self.values_builder.append_slice(v); + Ok(()) + } + + /// Appends values from a slice of type `T` and a validity boolean slice + #[inline] + pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<()> { + if values.len() != is_valid.len() { + return Err(ArrowError::InvalidArgumentError( + "Value and validity lengths must be equal".to_string(), + )); + } + self.bitmap_builder.append_slice(is_valid); + self.values_builder.append_slice(values); + Ok(()) + } + + /// Builds the [BooleanArray] and reset this builder. + pub fn finish(&mut self) -> BooleanArray { + let len = self.len(); + let null_bit_buffer = self.bitmap_builder.finish(); + let null_count = len - null_bit_buffer.count_set_bits(); + let builder = ArrayData::builder(DataType::Boolean) + .len(len) + .add_buffer(self.values_builder.finish()) + .null_bit_buffer((null_count > 0).then(|| null_bit_buffer)); + + let array_data = unsafe { builder.build_unchecked() }; + BooleanArray::from(array_data) + } +} + +impl ArrayBuilder for BooleanBuilder { + /// Returns the builder as a non-mutable `Any` reference. 
+ fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.values_builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.values_builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_boolean_array_builder_append_slice() { + let arr1 = + BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); + + let mut builder = BooleanArray::builder(0); + builder.append_slice(&[true, false]).unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_value(false).unwrap(); + let arr2 = builder.finish(); + + assert_eq!(arr1, arr2); + } + + #[test] + fn test_boolean_array_builder_append_slice_large() { + let arr1 = BooleanArray::from(vec![true; 513]); + + let mut builder = BooleanArray::builder(512); + builder.append_slice(&[true; 513]).unwrap(); + let arr2 = builder.finish(); + + assert_eq!(arr1, arr2); + } +} diff --git a/arrow/src/array/builder/buffer_builder.rs b/arrow/src/array/builder/buffer_builder.rs new file mode 100644 index 000000000000..83b2afb44e7a --- /dev/null +++ b/arrow/src/array/builder/buffer_builder.rs @@ -0,0 +1,418 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::mem; + +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::ArrowNativeType; + +use super::PhantomData; + +/// Converts a `MutableBuffer` to a `BufferBuilder`. +/// +/// `slots` is the number of array slots currently represented in the `MutableBuffer`. +pub(crate) fn mutable_buffer_to_builder( + mutable_buffer: MutableBuffer, + slots: usize, +) -> BufferBuilder { + BufferBuilder:: { + buffer: mutable_buffer, + len: slots, + _marker: PhantomData, + } +} + +/// Converts a `BufferBuilder` into its underlying `MutableBuffer`. +/// +/// `From` is not implemented because associated type bounds are unstable. +pub(crate) fn builder_to_mutable_buffer( + builder: BufferBuilder, +) -> MutableBuffer { + builder.buffer +} + +/// Builder for creating a [`Buffer`](crate::buffer::Buffer) object. +/// +/// A [`Buffer`](crate::buffer::Buffer) is the underlying data +/// structure of Arrow's [`Arrays`](crate::array::Array). +/// +/// For all supported types, there are type definitions for the +/// generic version of `BufferBuilder`, e.g. `UInt8BufferBuilder`. 
+/// +/// # Example: +/// +/// ``` +/// use arrow::array::UInt8BufferBuilder; +/// +/// # fn main() -> arrow::error::Result<()> { +/// let mut builder = UInt8BufferBuilder::new(100); +/// builder.append_slice(&[42, 43, 44]); +/// builder.append(45); +/// let buffer = builder.finish(); +/// +/// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 43, 44, 45]); +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug)] +pub struct BufferBuilder { + buffer: MutableBuffer, + len: usize, + _marker: PhantomData, +} + +impl BufferBuilder { + /// Creates a new builder with initial capacity for _at least_ `capacity` + /// elements of type `T`. + /// + /// The capacity can later be manually adjusted with the + /// [`reserve()`](BufferBuilder::reserve) method. + /// Also the + /// [`append()`](BufferBuilder::append), + /// [`append_slice()`](BufferBuilder::append_slice) and + /// [`advance()`](BufferBuilder::advance) + /// methods automatically increase the capacity if needed. + /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// + /// assert!(builder.capacity() >= 10); + /// ``` + #[inline] + pub fn new(capacity: usize) -> Self { + let buffer = MutableBuffer::new(capacity * mem::size_of::()); + + Self { + buffer, + len: 0, + _marker: PhantomData, + } + } + + /// Returns the current number of array elements in the internal buffer. + /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.append(42); + /// + /// assert_eq!(builder.len(), 1); + /// ``` + pub fn len(&self) -> usize { + self.len + } + + /// Returns whether the internal buffer is empty. 
+ /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.append(42); + /// + /// assert_eq!(builder.is_empty(), false); + /// ``` + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the actual capacity (number of elements) of the internal buffer. + /// + /// Note: the internal capacity returned by this method might be larger than + /// what you'd expect after setting the capacity in the `new()` or `reserve()` + /// functions. + pub fn capacity(&self) -> usize { + let byte_capacity = self.buffer.capacity(); + byte_capacity / std::mem::size_of::() + } + + /// Increases the number of elements in the internal buffer by `n` + /// and resizes the buffer as needed. + /// + /// The values of the newly added elements are 0. + /// This method is usually used when appending `NULL` values to the buffer + /// as they still require physical memory space. + /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.advance(2); + /// + /// assert_eq!(builder.len(), 2); + /// ``` + #[inline] + pub fn advance(&mut self, i: usize) { + let new_buffer_len = (self.len + i) * mem::size_of::(); + self.buffer.resize(new_buffer_len, 0); + self.len += i; + } + + /// Reserves memory for _at least_ `n` more elements of type `T`. + /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.reserve(10); + /// + /// assert!(builder.capacity() >= 20); + /// ``` + #[inline] + pub fn reserve(&mut self, n: usize) { + self.buffer.reserve(n * mem::size_of::()); + } + + /// Appends a value of type `T` into the builder, + /// growing the internal buffer as needed. 
+ /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.append(42); + /// + /// assert_eq!(builder.len(), 1); + /// ``` + #[inline] + pub fn append(&mut self, v: T) { + self.reserve(1); + self.buffer.push(v); + self.len += 1; + } + + /// Appends a value of type `T` into the builder N times, + /// growing the internal buffer as needed. + /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.append_n(10, 42); + /// + /// assert_eq!(builder.len(), 10); + /// ``` + #[inline] + pub fn append_n(&mut self, n: usize, v: T) { + self.reserve(n); + for _ in 0..n { + self.buffer.push(v); + } + self.len += n; + } + + /// Appends a slice of type `T`, growing the internal buffer as needed. + /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.append_slice(&[42, 44, 46]); + /// + /// assert_eq!(builder.len(), 3); + /// ``` + #[inline] + pub fn append_slice(&mut self, slice: &[T]) { + self.buffer.extend_from_slice(slice); + self.len += slice.len(); + } + + /// # Safety + /// This requires the iterator be a trusted length. This could instead require + /// the iterator implement `TrustedLen` once that is stabilized. + #[inline] + pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { + let iter = iter.into_iter(); + let len = iter + .size_hint() + .1 + .expect("append_trusted_len_iter expects upper bound"); + self.reserve(len); + for v in iter { + self.buffer.push(v) + } + self.len += len; + } + + /// Resets this builder and returns an immutable [`Buffer`](crate::buffer::Buffer). 
+ /// + /// # Example: + /// + /// ``` + /// use arrow::array::UInt8BufferBuilder; + /// + /// let mut builder = UInt8BufferBuilder::new(10); + /// builder.append_slice(&[42, 44, 46]); + /// + /// let buffer = builder.finish(); + /// + /// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 44, 46]); + /// ``` + #[inline] + pub fn finish(&mut self) -> Buffer { + let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.into() + } +} + +#[cfg(test)] +mod tests { + use crate::array::array::Array; + use crate::array::builder::ArrayBuilder; + use crate::array::Int32BufferBuilder; + use crate::array::Int8Builder; + use crate::array::UInt8BufferBuilder; + use crate::error::Result; + + #[test] + fn test_builder_i32_empty() { + let mut b = Int32BufferBuilder::new(5); + assert_eq!(0, b.len()); + assert_eq!(16, b.capacity()); + let a = b.finish(); + assert_eq!(0, a.len()); + } + + #[test] + fn test_builder_i32_alloc_zero_bytes() { + let mut b = Int32BufferBuilder::new(0); + b.append(123); + let a = b.finish(); + assert_eq!(4, a.len()); + } + + #[test] + fn test_builder_i32() { + let mut b = Int32BufferBuilder::new(5); + for i in 0..5 { + b.append(i); + } + assert_eq!(16, b.capacity()); + let a = b.finish(); + assert_eq!(20, a.len()); + } + + #[test] + fn test_builder_i32_grow_buffer() { + let mut b = Int32BufferBuilder::new(2); + assert_eq!(16, b.capacity()); + for i in 0..20 { + b.append(i); + } + assert_eq!(32, b.capacity()); + let a = b.finish(); + assert_eq!(80, a.len()); + } + + #[test] + fn test_builder_finish() { + let mut b = Int32BufferBuilder::new(5); + assert_eq!(16, b.capacity()); + for i in 0..10 { + b.append(i); + } + let mut a = b.finish(); + assert_eq!(40, a.len()); + assert_eq!(0, b.len()); + assert_eq!(0, b.capacity()); + + // Try build another buffer after cleaning up. 
+ for i in 0..20 { + b.append(i) + } + assert_eq!(32, b.capacity()); + a = b.finish(); + assert_eq!(80, a.len()); + } + + #[test] + fn test_reserve() { + let mut b = UInt8BufferBuilder::new(2); + assert_eq!(64, b.capacity()); + b.reserve(64); + assert_eq!(64, b.capacity()); + b.reserve(65); + assert_eq!(128, b.capacity()); + + let mut b = Int32BufferBuilder::new(2); + assert_eq!(16, b.capacity()); + b.reserve(16); + assert_eq!(16, b.capacity()); + b.reserve(17); + assert_eq!(32, b.capacity()); + } + + #[test] + fn test_append_slice() { + let mut b = UInt8BufferBuilder::new(0); + b.append_slice(b"Hello, "); + b.append_slice(b"World!"); + let buffer = b.finish(); + assert_eq!(13, buffer.len()); + + let mut b = Int32BufferBuilder::new(0); + b.append_slice(&[32, 54]); + let buffer = b.finish(); + assert_eq!(8, buffer.len()); + } + + #[test] + fn test_append_values() -> Result<()> { + let mut a = Int8Builder::new(0); + a.append_value(1)?; + a.append_null()?; + a.append_value(-2)?; + assert_eq!(a.len(), 3); + + // append values + let values = &[1, 2, 3, 4]; + let is_valid = &[true, true, false, true]; + a.append_values(values, is_valid)?; + + assert_eq!(a.len(), 7); + let array = a.finish(); + assert_eq!(array.value(0), 1); + assert!(array.is_null(1)); + assert_eq!(array.value(2), -2); + assert_eq!(array.value(3), 1); + assert_eq!(array.value(4), 2); + assert!(array.is_null(5)); + assert_eq!(array.value(6), 4); + + Ok(()) + } +} diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs new file mode 100644 index 000000000000..a7925358b8f8 --- /dev/null +++ b/arrow/src/array/builder/decimal_builder.rs @@ -0,0 +1,452 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use crate::array::ArrayBuilder; +use crate::array::ArrayRef; +use crate::array::DecimalArray; +use crate::array::FixedSizeBinaryArray; +use crate::array::OffsetSizeTrait; +use crate::array::UInt8Builder; +use crate::array::{GenericBinaryArray, GenericStringArray}; + +use crate::error::{ArrowError, Result}; + +use super::{FixedSizeBinaryBuilder, FixedSizeListBuilder}; +use super::{GenericBinaryBuilder, GenericListBuilder, GenericStringBuilder}; + +use crate::datatypes::validate_decimal_precision; + +/// Array Builder for [`DecimalArray`] +/// +/// See [`DecimalArray`] for example. +/// +#[derive(Debug)] +pub struct DecimalBuilder { + builder: FixedSizeListBuilder, + precision: usize, + scale: usize, + + /// Should i128 values be validated for compatibility with scale and precision? + /// defaults to true + value_validation: bool, +} + +impl ArrayBuilder for GenericBinaryBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. 
+ fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl ArrayBuilder for GenericStringBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + let a = GenericStringBuilder::::finish(self); + Arc::new(a) + } +} + +impl ArrayBuilder for FixedSizeBinaryBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl ArrayBuilder for DecimalBuilder { + /// Returns the builder as a non-mutable `Any` reference. 
+ fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl GenericBinaryBuilder { + /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values + /// array + pub fn new(capacity: usize) -> Self { + let values_builder = UInt8Builder::new(capacity); + Self { + builder: GenericListBuilder::new(values_builder), + } + } + + /// Appends a single byte value into the builder's values array. + /// + /// Note, when appending individual byte values you must call `append` to delimit each + /// distinct list value. + #[inline] + pub fn append_byte(&mut self, value: u8) -> Result<()> { + self.builder.values().append_value(value)?; + Ok(()) + } + + /// Appends a byte slice into the builder. + /// + /// Automatically calls the `append` method to delimit the slice appended in as a + /// distinct array element. + #[inline] + pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { + self.builder.values().append_slice(value.as_ref())?; + self.builder.append(true)?; + Ok(()) + } + + /// Finish the current variable-length list array slot. + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.builder.append(is_valid) + } + + /// Append a null value to the array. + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.append(false) + } + + /// Builds the `BinaryArray` and reset this builder. 
+ pub fn finish(&mut self) -> GenericBinaryArray { + GenericBinaryArray::::from(self.builder.finish()) + } +} + +impl GenericStringBuilder { + /// Creates a new `StringBuilder`, + /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder + pub fn new(capacity: usize) -> Self { + let values_builder = UInt8Builder::new(capacity); + Self { + builder: GenericListBuilder::new(values_builder), + } + } + + /// Creates a new `StringBuilder`, + /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder + /// `item_capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { + let values_builder = UInt8Builder::new(data_capacity); + Self { + builder: GenericListBuilder::with_capacity(values_builder, item_capacity), + } + } + + /// Appends a string into the builder. + /// + /// Automatically calls the `append` method to delimit the string appended in as a + /// distinct array element. + #[inline] + pub fn append_value(&mut self, value: impl AsRef) -> Result<()> { + self.builder + .values() + .append_slice(value.as_ref().as_bytes())?; + self.builder.append(true)?; + Ok(()) + } + + /// Finish the current variable-length list array slot. + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.builder.append(is_valid) + } + + /// Append a null value to the array. + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.append(false) + } + + /// Append an `Option` value to the array. + #[inline] + pub fn append_option(&mut self, value: Option>) -> Result<()> { + match value { + None => self.append_null()?, + Some(v) => self.append_value(v)?, + }; + Ok(()) + } + + /// Builds the `StringArray` and reset this builder. 
+ pub fn finish(&mut self) -> GenericStringArray { + GenericStringArray::::from(self.builder.finish()) + } +} + +impl FixedSizeBinaryBuilder { + /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values + /// array + pub fn new(capacity: usize, byte_width: i32) -> Self { + let values_builder = UInt8Builder::new(capacity); + Self { + builder: FixedSizeListBuilder::new(values_builder, byte_width), + } + } + + /// Appends a byte slice into the builder. + /// + /// Automatically calls the `append` method to delimit the slice appended in as a + /// distinct array element. + #[inline] + pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { + if self.builder.value_length() != value.as_ref().len() as i32 { + return Err(ArrowError::InvalidArgumentError( + "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string() + )); + } + self.builder.values().append_slice(value.as_ref())?; + self.builder.append(true) + } + + /// Append a null value to the array. + #[inline] + pub fn append_null(&mut self) -> Result<()> { + let length: usize = self.builder.value_length() as usize; + self.builder.values().append_slice(&vec![0u8; length][..])?; + self.builder.append(false) + } + + /// Builds the `FixedSizeBinaryArray` and reset this builder. 
+ pub fn finish(&mut self) -> FixedSizeBinaryArray { + FixedSizeBinaryArray::from(self.builder.finish()) + } +} + +impl DecimalBuilder { + /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values + /// array + pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { + let values_builder = UInt8Builder::new(capacity); + let byte_width = 16; + Self { + builder: FixedSizeListBuilder::new(values_builder, byte_width), + precision, + scale, + value_validation: true, + } + } + + /// Disable validation + /// + /// # Safety + /// + /// After disabling validation, caller must ensure that appended values are compatible + /// for the specified precision and scale. + pub unsafe fn disable_value_validation(&mut self) { + self.value_validation = false; + } + + /// Appends a byte slice into the builder. + /// + /// Automatically calls the `append` method to delimit the slice appended in as a + /// distinct array element. + #[inline] + pub fn append_value(&mut self, value: impl Into) -> Result<()> { + let value = if self.value_validation { + validate_decimal_precision(value.into(), self.precision)? + } else { + value.into() + }; + + let value_as_bytes = Self::from_i128_to_fixed_size_bytes( + value, + self.builder.value_length() as usize, + )?; + if self.builder.value_length() != value_as_bytes.len() as i32 { + return Err(ArrowError::InvalidArgumentError( + "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() + )); + } + self.builder + .values() + .append_slice(value_as_bytes.as_slice())?; + self.builder.append(true) + } + + pub(crate) fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { + if size > 16 { + return Err(ArrowError::InvalidArgumentError( + "DecimalBuilder only supports values up to 16 bytes.".to_string(), + )); + } + let res = v.to_le_bytes(); + let start_byte = 16 - size; + Ok(res[start_byte..16].to_vec()) + } + + /// Append a null value to the array. 
+ #[inline] + pub fn append_null(&mut self) -> Result<()> { + let length: usize = self.builder.value_length() as usize; + self.builder.values().append_slice(&vec![0u8; length][..])?; + self.builder.append(false) + } + + /// Builds the `DecimalArray` and reset this builder. + pub fn finish(&mut self) -> DecimalArray { + DecimalArray::from_fixed_size_list_array( + self.builder.finish(), + self.precision, + self.scale, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::datatypes::DataType; + use crate::util::decimal::Decimal128; + + #[test] + fn test_decimal_builder() { + let mut builder = DecimalBuilder::new(30, 38, 6); + + builder.append_value(8_887_000_000_i128).unwrap(); + builder.append_null().unwrap(); + builder.append_value(-8_887_000_000_i128).unwrap(); + let decimal_array: DecimalArray = builder.finish(); + + assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); + assert_eq!(3, decimal_array.len()); + assert_eq!(1, decimal_array.null_count()); + assert_eq!(32, decimal_array.value_offset(2)); + assert_eq!(16, decimal_array.value_length()); + } + + #[test] + fn test_decimal_builder_with_decimal128() { + let mut builder = DecimalBuilder::new(30, 38, 6); + + builder + .append_value(Decimal128::new_from_i128(30, 38, 8_887_000_000_i128)) + .unwrap(); + builder.append_null().unwrap(); + builder + .append_value(Decimal128::new_from_i128(30, 38, -8_887_000_000_i128)) + .unwrap(); + let decimal_array: DecimalArray = builder.finish(); + + assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); + assert_eq!(3, decimal_array.len()); + assert_eq!(1, decimal_array.null_count()); + assert_eq!(32, decimal_array.value_offset(2)); + assert_eq!(16, decimal_array.value_length()); + } +} diff --git a/arrow/src/array/builder/fixed_size_list_builder.rs b/arrow/src/array/builder/fixed_size_list_builder.rs new file mode 100644 index 000000000000..f0233e2638bd --- /dev/null +++ 
b/arrow/src/array/builder/fixed_size_list_builder.rs @@ -0,0 +1,251 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use crate::array::ArrayData; +use crate::array::ArrayRef; +use crate::array::FixedSizeListArray; +use crate::array::Int32BufferBuilder; +use crate::datatypes::DataType; +use crate::datatypes::Field; +use crate::error::Result; + +use super::ArrayBuilder; +use super::BooleanBufferBuilder; + +/// Array builder for `ListArray` +#[derive(Debug)] +pub struct FixedSizeListBuilder { + bitmap_builder: BooleanBufferBuilder, + values_builder: T, + len: usize, + list_len: i32, +} + +impl FixedSizeListBuilder { + /// Creates a new `FixedSizeListBuilder` from a given values array builder + /// `length` is the number of values within each array + pub fn new(values_builder: T, length: i32) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, length, capacity) + } + + /// Creates a new `FixedSizeListBuilder` from a given values array builder + /// `length` is the number of values within each array + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, length: i32, 
capacity: usize) -> Self { + let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); + offsets_builder.append(0); + Self { + bitmap_builder: BooleanBufferBuilder::new(capacity), + values_builder, + len: 0, + list_len: length, + } + } +} + +impl ArrayBuilder for FixedSizeListBuilder +where + T: 'static, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.len + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl FixedSizeListBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call `append` to delimit each distinct list value. + pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + pub fn value_length(&self) -> i32 { + self.list_len + } + + /// Finish the current variable-length list array slot + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.bitmap_builder.append(is_valid); + self.len += 1; + Ok(()) + } + + /// Builds the `FixedSizeListBuilder` and reset this builder. 
+ pub fn finish(&mut self) -> FixedSizeListArray { + let len = self.len(); + self.len = 0; + let values_arr = self + .values_builder + .as_any_mut() + .downcast_mut::() + .unwrap() + .finish(); + let values_data = values_arr.data(); + + // check that values_data length is multiple of len if we have data + if len != 0 { + assert!( + values_data.len() / len == self.list_len as usize, + "Values of FixedSizeList must have equal lengths, values have length {} and list has {}", + values_data.len() / len, + self.list_len + ); + } + + let null_bit_buffer = self.bitmap_builder.finish(); + let array_data = ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", values_data.data_type().clone(), true)), + self.list_len, + )) + .len(len) + .add_child_data(values_data.clone()) + .null_bit_buffer(Some(null_bit_buffer)); + + let array_data = unsafe { array_data.build_unchecked() }; + + FixedSizeListArray::from(array_data) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::array::FixedSizeBinaryArray; + use crate::array::FixedSizeBinaryBuilder; + use crate::array::Int32Array; + use crate::array::Int32Builder; + + #[test] + fn test_fixed_size_list_array_builder() { + let values_builder = Int32Builder::new(10); + let mut builder = FixedSizeListBuilder::new(values_builder, 3); + + // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] + builder.values().append_value(0).unwrap(); + builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); + builder.append(true).unwrap(); + builder.values().append_null().unwrap(); + builder.values().append_null().unwrap(); + builder.values().append_null().unwrap(); + builder.append(false).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_null().unwrap(); + builder.values().append_value(5).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); + 
builder.values().append_null().unwrap(); + builder.append(true).unwrap(); + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(6, list_array.value_offset(2)); + assert_eq!(3, list_array.value_length()); + } + + #[test] + fn test_fixed_size_list_array_builder_empty() { + let values_builder = Int32Array::builder(5); + let mut builder = FixedSizeListBuilder::new(values_builder, 3); + + let arr = builder.finish(); + assert_eq!(0, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_fixed_size_list_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = FixedSizeListBuilder::new(values_builder, 3); + + builder.values().append_slice(&[1, 2, 3]).unwrap(); + builder.append(true).unwrap(); + builder.values().append_slice(&[4, 5, 6]).unwrap(); + builder.append(true).unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.values().append_slice(&[7, 8, 9]).unwrap(); + builder.append(true).unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_fixed_size_binary_builder() { + let mut builder = FixedSizeBinaryBuilder::new(15, 5); + + // [b"hello", null, "arrow"] + builder.append_value(b"hello").unwrap(); + builder.append_null().unwrap(); + builder.append_value(b"arrow").unwrap(); + let fixed_size_binary_array: FixedSizeBinaryArray = builder.finish(); + + assert_eq!( + &DataType::FixedSizeBinary(5), + fixed_size_binary_array.data_type() + ); + assert_eq!(3, fixed_size_binary_array.len()); + assert_eq!(1, fixed_size_binary_array.null_count()); + assert_eq!(10, fixed_size_binary_array.value_offset(2)); + assert_eq!(5, fixed_size_binary_array.value_length()); + } +} diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs 
new file mode 100644 index 000000000000..1449b5c09cc0 --- /dev/null +++ b/arrow/src/array/builder/generic_list_builder.rs @@ -0,0 +1,502 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use crate::array::ArrayData; +use crate::array::ArrayRef; +use crate::array::GenericListArray; +use crate::array::OffsetSizeTrait; +use crate::datatypes::DataType; +use crate::datatypes::Field; +use crate::error::Result; + +use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; + +/// Array builder for `ListArray` +#[derive(Debug)] +pub struct GenericListBuilder { + offsets_builder: BufferBuilder, + bitmap_builder: BooleanBufferBuilder, + values_builder: T, + len: OffsetSize, +} + +impl GenericListBuilder { + /// Creates a new `ListArrayBuilder` from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new `ListArrayBuilder` from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let mut offsets_builder = 
BufferBuilder::::new(capacity + 1); + let len = OffsetSize::zero(); + offsets_builder.append(len); + Self { + offsets_builder, + bitmap_builder: BooleanBufferBuilder::new(capacity), + values_builder, + len, + } + } +} + +impl ArrayBuilder + for GenericListBuilder +where + T: 'static, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.len.to_usize().unwrap() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.len == OffsetSize::zero() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl GenericListBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call `append` to delimit each distinct list value. + pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Finish the current variable-length list array slot + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.offsets_builder + .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.bitmap_builder.append(is_valid); + self.len += OffsetSize::one(); + Ok(()) + } + + /// Builds the `ListArray` and reset this builder. 
+ pub fn finish(&mut self) -> GenericListArray { + let len = self.len(); + self.len = OffsetSize::zero(); + let values_arr = self + .values_builder + .as_any_mut() + .downcast_mut::() + .unwrap() + .finish(); + let values_data = values_arr.data(); + + let offset_buffer = self.offsets_builder.finish(); + let null_bit_buffer = self.bitmap_builder.finish(); + self.offsets_builder.append(self.len); + let field = Box::new(Field::new( + "item", + values_data.data_type().clone(), + true, // TODO: find a consistent way of getting this + )); + let data_type = if OffsetSize::IS_LARGE { + DataType::LargeList(field) + } else { + DataType::List(field) + }; + let array_data = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data.clone()) + .null_bit_buffer(Some(null_bit_buffer)); + + let array_data = unsafe { array_data.build_unchecked() }; + + GenericListArray::::from(array_data) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::array::Int32Array; + use crate::array::Int32Builder; + use crate::buffer::Buffer; + + use crate::array::builder::{ + BinaryBuilder, LargeBinaryBuilder, LargeListBuilder, ListBuilder, StringBuilder, + }; + + #[test] + fn test_list_array_builder() { + let values_builder = Int32Builder::new(10); + let mut builder = ListBuilder::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0).unwrap(); + builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_value(4).unwrap(); + builder.values().append_value(5).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); + builder.append(true).unwrap(); + let list_array = builder.finish(); + + let values = list_array.values().data().buffers()[0].clone(); + 
assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); + assert_eq!( + Buffer::from_slice_ref(&[0, 3, 6, 8]), + list_array.data().buffers()[0].clone() + ); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_length(2)); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_large_list_array_builder() { + let values_builder = Int32Builder::new(10); + let mut builder = LargeListBuilder::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0).unwrap(); + builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_value(4).unwrap(); + builder.values().append_value(5).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); + builder.append(true).unwrap(); + let list_array = builder.finish(); + + let values = list_array.values().data().buffers()[0].clone(); + assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); + assert_eq!( + Buffer::from_slice_ref(&[0i64, 3, 6, 8]), + list_array.data().buffers()[0].clone() + ); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_length(2)); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_array_builder_nulls() { + let values_builder = Int32Builder::new(10); + let mut builder = ListBuilder::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0).unwrap(); + 
builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); + builder.append(true).unwrap(); + builder.append(false).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_null().unwrap(); + builder.values().append_value(5).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); + builder.append(true).unwrap(); + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(3, list_array.value_offsets()[2]); + assert_eq!(3, list_array.value_length(2)); + } + + #[test] + fn test_large_list_array_builder_nulls() { + let values_builder = Int32Builder::new(10); + let mut builder = LargeListBuilder::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0).unwrap(); + builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); + builder.append(true).unwrap(); + builder.append(false).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_null().unwrap(); + builder.values().append_value(5).unwrap(); + builder.append(true).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); + builder.append(true).unwrap(); + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(3, list_array.value_offsets()[2]); + assert_eq!(3, list_array.value_length(2)); + } + + #[test] + fn test_list_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]).unwrap(); + builder.append(true).unwrap(); + builder.values().append_slice(&[4, 5, 6]).unwrap(); + 
builder.append(true).unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.values().append_slice(&[7, 8, 9]).unwrap(); + builder.append(true).unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_list_list_array_builder() { + let primitive_builder = Int32Builder::new(10); + let values_builder = ListBuilder::new(primitive_builder); + let mut builder = ListBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1).unwrap(); + builder.values().values().append_value(2).unwrap(); + builder.values().append(true).unwrap(); + builder.values().values().append_value(3).unwrap(); + builder.values().values().append_value(4).unwrap(); + builder.values().append(true).unwrap(); + builder.append(true).unwrap(); + + builder.values().values().append_value(5).unwrap(); + builder.values().values().append_value(6).unwrap(); + builder.values().values().append_value(7).unwrap(); + builder.values().append(true).unwrap(); + builder.values().append(false).unwrap(); + builder.values().values().append_value(8).unwrap(); + builder.values().append(true).unwrap(); + builder.append(true).unwrap(); + + builder.append(false).unwrap(); + + builder.values().values().append_value(9).unwrap(); + builder.values().values().append_value(10).unwrap(); + builder.values().append(true).unwrap(); + builder.append(true).unwrap(); + + let list_array = builder.finish(); + + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!( + Buffer::from_slice_ref(&[0, 2, 5, 5, 6]), + list_array.data().buffers()[0].clone() + ); + + assert_eq!(6, list_array.values().data().len()); + assert_eq!(1, list_array.values().data().null_count()); + assert_eq!( + Buffer::from_slice_ref(&[0, 2, 4, 7, 7, 8, 10]), + list_array.values().data().buffers()[0].clone() + ); + + assert_eq!(10, 
list_array.values().data().child_data()[0].len()); + assert_eq!(0, list_array.values().data().child_data()[0].null_count()); + assert_eq!( + Buffer::from_slice_ref(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + list_array.values().data().child_data()[0].buffers()[0].clone() + ); + } + + #[test] + fn test_binary_array_builder() { + let mut builder = BinaryBuilder::new(20); + + builder.append_byte(b'h').unwrap(); + builder.append_byte(b'e').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'o').unwrap(); + builder.append(true).unwrap(); + builder.append(true).unwrap(); + builder.append_byte(b'w').unwrap(); + builder.append_byte(b'o').unwrap(); + builder.append_byte(b'r').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'd').unwrap(); + builder.append(true).unwrap(); + + let binary_array = builder.finish(); + + assert_eq!(3, binary_array.len()); + assert_eq!(0, binary_array.null_count()); + assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); + assert_eq!([] as [u8; 0], binary_array.value(1)); + assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); + assert_eq!(5, binary_array.value_offsets()[2]); + assert_eq!(5, binary_array.value_length(2)); + } + + #[test] + fn test_large_binary_array_builder() { + let mut builder = LargeBinaryBuilder::new(20); + + builder.append_byte(b'h').unwrap(); + builder.append_byte(b'e').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'o').unwrap(); + builder.append(true).unwrap(); + builder.append(true).unwrap(); + builder.append_byte(b'w').unwrap(); + builder.append_byte(b'o').unwrap(); + builder.append_byte(b'r').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'd').unwrap(); + builder.append(true).unwrap(); + + let binary_array = builder.finish(); + + assert_eq!(3, binary_array.len()); + assert_eq!(0, binary_array.null_count()); + assert_eq!([b'h', b'e', b'l', 
b'l', b'o'], binary_array.value(0)); + assert_eq!([] as [u8; 0], binary_array.value(1)); + assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); + assert_eq!(5, binary_array.value_offsets()[2]); + assert_eq!(5, binary_array.value_length(2)); + } + + #[test] + fn test_string_array_builder() { + let mut builder = StringBuilder::new(20); + + builder.append_value("hello").unwrap(); + builder.append(true).unwrap(); + builder.append_value("world").unwrap(); + + let string_array = builder.finish(); + + assert_eq!(3, string_array.len()); + assert_eq!(0, string_array.null_count()); + assert_eq!("hello", string_array.value(0)); + assert_eq!("", string_array.value(1)); + assert_eq!("world", string_array.value(2)); + assert_eq!(5, string_array.value_offsets()[2]); + assert_eq!(5, string_array.value_length(2)); + } + + #[test] + fn test_string_array_builder_finish() { + let mut builder = StringBuilder::new(10); + + builder.append_value("hello").unwrap(); + builder.append_value("world").unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.append_value("arrow").unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_string_array_builder_append_string() { + let mut builder = StringBuilder::new(20); + + let var = "hello".to_owned(); + builder.append_value(&var).unwrap(); + builder.append(true).unwrap(); + builder.append_value("world").unwrap(); + + let string_array = builder.finish(); + + assert_eq!(3, string_array.len()); + assert_eq!(0, string_array.null_count()); + assert_eq!("hello", string_array.value(0)); + assert_eq!("", string_array.value(1)); + assert_eq!("world", string_array.value(2)); + assert_eq!(5, string_array.value_offsets()[2]); + assert_eq!(5, string_array.value_length(2)); + } + + #[test] + fn test_string_array_builder_append_option() { + let mut builder = StringBuilder::new(20); + 
builder.append_option(Some("hello")).unwrap(); + builder.append_option(None::<&str>).unwrap(); + builder.append_option(None::).unwrap(); + builder.append_option(Some("world")).unwrap(); + + let string_array = builder.finish(); + + assert_eq!(4, string_array.len()); + assert_eq!("hello", string_array.value(0)); + assert!(string_array.is_null(1)); + assert!(string_array.is_null(2)); + assert_eq!("world", string_array.value(3)); + } +} diff --git a/arrow/src/array/builder/map_builder.rs b/arrow/src/array/builder/map_builder.rs new file mode 100644 index 000000000000..30ea9ad1b2ae --- /dev/null +++ b/arrow/src/array/builder/map_builder.rs @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::sync::Arc; + +use crate::array::array::Array; +use crate::array::ArrayData; +use crate::array::ArrayRef; +use crate::array::MapArray; +use crate::array::StructArray; +use crate::datatypes::DataType; +use crate::datatypes::Field; +use crate::error::{ArrowError, Result}; + +use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; + +#[derive(Debug)] +pub struct MapBuilder { + offsets_builder: BufferBuilder, + bitmap_builder: BooleanBufferBuilder, + field_names: MapFieldNames, + key_builder: K, + value_builder: V, + len: i32, +} + +#[derive(Debug, Clone)] +pub struct MapFieldNames { + pub entry: String, + pub key: String, + pub value: String, +} + +impl Default for MapFieldNames { + fn default() -> Self { + Self { + entry: "entries".to_string(), + key: "keys".to_string(), + value: "values".to_string(), + } + } +} + +#[allow(dead_code)] +impl MapBuilder { + pub fn new( + field_names: Option, + key_builder: K, + value_builder: V, + ) -> Self { + let capacity = key_builder.len(); + Self::with_capacity(field_names, key_builder, value_builder, capacity) + } + + pub fn with_capacity( + field_names: Option, + key_builder: K, + value_builder: V, + capacity: usize, + ) -> Self { + let mut offsets_builder = BufferBuilder::::new(capacity + 1); + let len = 0; + offsets_builder.append(len); + Self { + offsets_builder, + bitmap_builder: BooleanBufferBuilder::new(capacity), + field_names: field_names.unwrap_or_default(), + key_builder, + value_builder, + len, + } + } + + pub fn keys(&mut self) -> &mut K { + &mut self.key_builder + } + + pub fn values(&mut self) -> &mut V { + &mut self.value_builder + } + + /// Finish the current map array slot + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<()> { + if self.key_builder.len() != self.value_builder.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot append to a map builder when its keys and values have unequal lengths of {} and {}", + self.key_builder.len(), + 
self.value_builder.len() + ))); + } + self.offsets_builder.append(self.key_builder.len() as i32); + self.bitmap_builder.append(is_valid); + self.len += 1; + Ok(()) + } + + pub fn finish(&mut self) -> MapArray { + let len = self.len(); + self.len = 0; + + // Build the keys + let keys_arr = self + .key_builder + .as_any_mut() + .downcast_mut::() + .unwrap() + .finish(); + let values_arr = self + .value_builder + .as_any_mut() + .downcast_mut::() + .unwrap() + .finish(); + + let keys_field = Field::new( + self.field_names.key.as_str(), + keys_arr.data_type().clone(), + false, // always nullable + ); + let values_field = Field::new( + self.field_names.value.as_str(), + values_arr.data_type().clone(), + true, + ); + + let struct_array = + StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); + + let offset_buffer = self.offsets_builder.finish(); + let null_bit_buffer = self.bitmap_builder.finish(); + self.offsets_builder.append(self.len); + let map_field = Box::new(Field::new( + self.field_names.entry.as_str(), + struct_array.data_type().clone(), + false, // always non-nullable + )); + let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys + .len(len) + .add_buffer(offset_buffer) + .add_child_data(struct_array.data().clone()) + .null_bit_buffer(Some(null_bit_buffer)); + + let array_data = unsafe { array_data.build_unchecked() }; + + MapArray::from(array_data) + } +} + +impl ArrayBuilder for MapBuilder { + fn len(&self) -> usize { + self.len as usize + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box) -> Box { + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::builder::StringBuilder; + use crate::array::Int32Builder; + use crate::bitmap::Bitmap; + use 
crate::buffer::Buffer; + + // TODO: add a test that finishes building, after designing a spec-compliant + // way of inserting values to the map. + // A map's values shouldn't be repeated within a slot + + #[test] + fn test_map_array_builder() { + let string_builder = StringBuilder::new(4); + let int_builder = Int32Builder::new(4); + + let mut builder = MapBuilder::new(None, string_builder, int_builder); + + let string_builder = builder.keys(); + string_builder.append_value("joe").unwrap(); + string_builder.append_null().unwrap(); + string_builder.append_null().unwrap(); + string_builder.append_value("mark").unwrap(); + + let int_builder = builder.values(); + int_builder.append_value(1).unwrap(); + int_builder.append_value(2).unwrap(); + int_builder.append_null().unwrap(); + int_builder.append_value(4).unwrap(); + + builder.append(true).unwrap(); + builder.append(false).unwrap(); + builder.append(true).unwrap(); + + let arr = builder.finish(); + + let map_data = arr.data(); + assert_eq!(3, map_data.len()); + assert_eq!(1, map_data.null_count()); + assert_eq!( + Some(&Bitmap::from(Buffer::from(&[5_u8]))), + map_data.null_bitmap() + ); + + let expected_string_data = ArrayData::builder(DataType::Utf8) + .len(4) + .null_bit_buffer(Some(Buffer::from(&[9_u8]))) + .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) + .add_buffer(Buffer::from_slice_ref(b"joemark")) + .build() + .unwrap(); + + let expected_int_data = ArrayData::builder(DataType::Int32) + .len(4) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[11_u8]))) + .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) + .build() + .unwrap(); + + assert_eq!(&expected_string_data, arr.keys().data()); + assert_eq!(&expected_int_data, arr.values().data()); + } +} diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs new file mode 100644 index 000000000000..4cd82d9bfe3e --- /dev/null +++ b/arrow/src/array/builder/mod.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one 
+// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines a [`BufferBuilder`](crate::array::BufferBuilder) capable +//! of creating a [`Buffer`](crate::buffer::Buffer) which can be used +//! as an internal buffer in an [`ArrayData`](crate::array::ArrayData) +//! object. + +mod boolean_buffer_builder; +mod boolean_builder; +mod buffer_builder; +mod decimal_builder; +mod fixed_size_list_builder; +mod generic_list_builder; +mod map_builder; +mod primitive_builder; +mod primitive_dictionary_builder; +mod string_dictionary_builder; +mod struct_builder; +mod union_builder; + +use std::any::Any; +use std::marker::PhantomData; +use std::ops::Range; + +use super::ArrayRef; +use super::OffsetSizeTrait; +use super::UInt8Builder; + +pub use boolean_buffer_builder::BooleanBufferBuilder; +pub use boolean_builder::BooleanBuilder; +pub use buffer_builder::BufferBuilder; +pub use decimal_builder::DecimalBuilder; +pub use fixed_size_list_builder::FixedSizeListBuilder; +pub use generic_list_builder::GenericListBuilder; +pub use map_builder::MapBuilder; +pub use primitive_builder::PrimitiveBuilder; +pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder; +pub use string_dictionary_builder::StringDictionaryBuilder; +pub use struct_builder::StructBuilder; +pub use 
union_builder::UnionBuilder; + +pub use struct_builder::make_builder; + +/// Trait for dealing with different array builders at runtime +/// +/// # Example +/// +/// ``` +/// # use arrow::{ +/// # array::{ArrayBuilder, ArrayRef, Float64Builder, Int64Builder, StringArray, StringBuilder}, +/// # error::ArrowError, +/// # }; +/// # fn main() -> std::result::Result<(), ArrowError> { +/// // Create +/// let mut data_builders: Vec> = vec![ +/// Box::new(Float64Builder::new(1024)), +/// Box::new(Int64Builder::new(1024)), +/// Box::new(StringBuilder::new(1024)), +/// ]; +/// +/// // Fill +/// data_builders[0] +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap() +/// .append_value(3.14)?; +/// data_builders[1] +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap() +/// .append_value(-1)?; +/// data_builders[2] +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap() +/// .append_value("🍎")?; +/// +/// // Finish +/// let array_refs: Vec = data_builders +/// .iter_mut() +/// .map(|builder| builder.finish()) +/// .collect(); +/// assert_eq!(array_refs[0].len(), 1); +/// assert_eq!(array_refs[1].is_null(0), false); +/// assert_eq!( +/// array_refs[2] +/// .as_any() +/// .downcast_ref::() +/// .unwrap() +/// .value(0), +/// "🍎" +/// ); +/// # Ok(()) +/// # } +/// ``` +pub trait ArrayBuilder: Any + Send { + /// Returns the number of array slots in the builder + fn len(&self) -> usize; + + /// Returns whether number of array slots is zero + fn is_empty(&self) -> bool; + + /// Builds the array + fn finish(&mut self) -> ArrayRef; + + /// Returns the builder as a non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &dyn Any; + + /// Returns the builder as a mutable `Any` reference. 
+ /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut dyn Any; + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box; +} + +pub type ListBuilder = GenericListBuilder; +pub type LargeListBuilder = GenericListBuilder; + +/// Array builder for `BinaryArray` +#[derive(Debug)] +pub struct GenericBinaryBuilder { + builder: GenericListBuilder, +} + +pub type BinaryBuilder = GenericBinaryBuilder; +pub type LargeBinaryBuilder = GenericBinaryBuilder; + +#[derive(Debug)] +pub struct GenericStringBuilder { + builder: GenericListBuilder, +} + +pub type StringBuilder = GenericStringBuilder; +pub type LargeStringBuilder = GenericStringBuilder; + +#[derive(Debug)] +pub struct FixedSizeBinaryBuilder { + builder: FixedSizeListBuilder, +} + +#[cfg(test)] +mod tests {} diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs new file mode 100644 index 000000000000..83c62509cfb0 --- /dev/null +++ b/arrow/src/array/builder/primitive_builder.rs @@ -0,0 +1,436 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use crate::array::ArrayData; +use crate::array::ArrayRef; +use crate::array::DictionaryArray; +use crate::array::PrimitiveArray; +use crate::datatypes::ArrowPrimitiveType; +use crate::datatypes::DataType; +use crate::error::{ArrowError, Result}; + +use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; + +/// Array builder for fixed-width primitive types +#[derive(Debug)] +pub struct PrimitiveBuilder { + values_builder: BufferBuilder, + /// We only materialize the builder when we add `false`. + /// This optimization is **very** important for performance of `StringBuilder`. + bitmap_builder: Option, +} + +impl ArrayBuilder for PrimitiveBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.values_builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.values_builder.is_empty() + } + + /// Builds the array and reset this builder. 
+ fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl PrimitiveBuilder { + /// Creates a new primitive array builder + pub fn new(capacity: usize) -> Self { + Self { + values_builder: BufferBuilder::::new(capacity), + bitmap_builder: None, + } + } + + /// Returns the capacity of this builder measured in slots of type `T` + pub fn capacity(&self) -> usize { + self.values_builder.capacity() + } + + /// Appends a value of type `T` into the builder + #[inline] + pub fn append_value(&mut self, v: T::Native) -> Result<()> { + if let Some(b) = self.bitmap_builder.as_mut() { + b.append(true); + } + self.values_builder.append(v); + Ok(()) + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.materialize_bitmap_builder(); + self.bitmap_builder.as_mut().unwrap().append(false); + self.values_builder.advance(1); + Ok(()) + } + + #[inline] + pub fn append_nulls(&mut self, n: usize) -> Result<()> { + self.materialize_bitmap_builder(); + self.bitmap_builder.as_mut().unwrap().append_n(n, false); + self.values_builder.advance(n); + Ok(()) + } + + /// Appends an `Option` into the builder + #[inline] + pub fn append_option(&mut self, v: Option) -> Result<()> { + match v { + None => self.append_null()?, + Some(v) => self.append_value(v)?, + }; + Ok(()) + } + + /// Appends a slice of type `T` into the builder + #[inline] + pub fn append_slice(&mut self, v: &[T::Native]) -> Result<()> { + if let Some(b) = self.bitmap_builder.as_mut() { + b.append_n(v.len(), true); + } + self.values_builder.append_slice(v); + Ok(()) + } + + /// Appends values from a slice of type `T` and a validity boolean slice + #[inline] + pub fn append_values( + &mut self, + values: &[T::Native], + is_valid: &[bool], + ) -> Result<()> { + if values.len() != is_valid.len() { + return Err(ArrowError::InvalidArgumentError( + "Value and validity lengths must be equal".to_string(), + )); + } + if is_valid.iter().any(|v| !*v) { + 
self.materialize_bitmap_builder(); + } + if let Some(b) = self.bitmap_builder.as_mut() { + b.append_slice(is_valid); + } + self.values_builder.append_slice(values); + Ok(()) + } + + /// Appends values from a trusted length iterator. + /// + /// # Safety + /// This requires the iterator be a trusted length. This could instead require + /// the iterator implement `TrustedLen` once that is stabilized. + #[inline] + pub unsafe fn append_trusted_len_iter( + &mut self, + iter: impl IntoIterator, + ) -> Result<()> { + let iter = iter.into_iter(); + let len = iter + .size_hint() + .1 + .expect("append_trusted_len_iter requires an upper bound"); + + if let Some(b) = self.bitmap_builder.as_mut() { + b.append_n(len, true); + } + self.values_builder.append_trusted_len_iter(iter); + Ok(()) + } + + /// Builds the `PrimitiveArray` and reset this builder. + pub fn finish(&mut self) -> PrimitiveArray { + let len = self.len(); + let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); + let null_count = len + - null_bit_buffer + .as_ref() + .map(|b| b.count_set_bits()) + .unwrap_or(len); + let builder = ArrayData::builder(T::DATA_TYPE) + .len(len) + .add_buffer(self.values_builder.finish()) + .null_bit_buffer(if null_count > 0 { + null_bit_buffer + } else { + None + }); + + let array_data = unsafe { builder.build_unchecked() }; + PrimitiveArray::::from(array_data) + } + + /// Builds the `DictionaryArray` and reset this builder. 
+ pub fn finish_dict(&mut self, values: ArrayRef) -> DictionaryArray { + let len = self.len(); + let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); + let null_count = len + - null_bit_buffer + .as_ref() + .map(|b| b.count_set_bits()) + .unwrap_or(len); + let data_type = DataType::Dictionary( + Box::new(T::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + let mut builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(self.values_builder.finish()); + if null_count > 0 { + builder = builder.null_bit_buffer(null_bit_buffer); + } + builder = builder.add_child_data(values.data().clone()); + let array_data = unsafe { builder.build_unchecked() }; + DictionaryArray::::from(array_data) + } + + fn materialize_bitmap_builder(&mut self) { + if self.bitmap_builder.is_some() { + return; + } + let mut b = BooleanBufferBuilder::new(0); + b.reserve(self.values_builder.capacity()); + b.append_n(self.values_builder.len(), true); + self.bitmap_builder = Some(b); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::array::BooleanArray; + use crate::array::Date32Array; + use crate::array::Int32Array; + use crate::array::Int32Builder; + use crate::array::TimestampSecondArray; + use crate::buffer::Buffer; + + #[test] + fn test_primitive_array_builder_i32() { + let mut builder = Int32Array::builder(5); + for i in 0..5 { + builder.append_value(i).unwrap(); + } + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i32, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_i32_append_iter() { + let mut builder = Int32Array::builder(5); + unsafe { builder.append_trusted_len_iter(0..5) }.unwrap(); + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + 
assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i32, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_i32_append_nulls() { + let mut builder = Int32Array::builder(5); + builder.append_nulls(5).unwrap(); + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(5, arr.null_count()); + for i in 0..5 { + assert!(arr.is_null(i)); + assert!(!arr.is_valid(i)); + } + } + + #[test] + fn test_primitive_array_builder_date32() { + let mut builder = Date32Array::builder(5); + for i in 0..5 { + builder.append_value(i).unwrap(); + } + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i32, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_timestamp_second() { + let mut builder = TimestampSecondArray::builder(5); + for i in 0..5 { + builder.append_value(i).unwrap(); + } + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i64, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_bool() { + // 00000010 01001000 + let buf = Buffer::from([72_u8, 2_u8]); + let mut builder = BooleanArray::builder(10); + for i in 0..10 { + if i == 3 || i == 6 || i == 9 { + builder.append_value(true).unwrap(); + } else { + builder.append_value(false).unwrap(); + } + } + + let arr = builder.finish(); + assert_eq!(&buf, arr.values()); + assert_eq!(10, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..10 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i) + } + } + + #[test] + fn test_primitive_array_builder_append_option() { + let arr1 = 
Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); + + let mut builder = Int32Array::builder(5); + builder.append_option(Some(0)).unwrap(); + builder.append_option(None).unwrap(); + builder.append_option(Some(2)).unwrap(); + builder.append_option(None).unwrap(); + builder.append_option(Some(4)).unwrap(); + let arr2 = builder.finish(); + + assert_eq!(arr1.len(), arr2.len()); + assert_eq!(arr1.offset(), arr2.offset()); + assert_eq!(arr1.null_count(), arr2.null_count()); + for i in 0..5 { + assert_eq!(arr1.is_null(i), arr2.is_null(i)); + assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); + if arr1.is_valid(i) { + assert_eq!(arr1.value(i), arr2.value(i)); + } + } + } + + #[test] + fn test_primitive_array_builder_append_null() { + let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); + + let mut builder = Int32Array::builder(5); + builder.append_value(0).unwrap(); + builder.append_value(2).unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_value(4).unwrap(); + let arr2 = builder.finish(); + + assert_eq!(arr1.len(), arr2.len()); + assert_eq!(arr1.offset(), arr2.offset()); + assert_eq!(arr1.null_count(), arr2.null_count()); + for i in 0..5 { + assert_eq!(arr1.is_null(i), arr2.is_null(i)); + assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); + if arr1.is_valid(i) { + assert_eq!(arr1.value(i), arr2.value(i)); + } + } + } + + #[test] + fn test_primitive_array_builder_append_slice() { + let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); + + let mut builder = Int32Array::builder(5); + builder.append_slice(&[0, 2]).unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_value(4).unwrap(); + let arr2 = builder.finish(); + + assert_eq!(arr1.len(), arr2.len()); + assert_eq!(arr1.offset(), arr2.offset()); + assert_eq!(arr1.null_count(), arr2.null_count()); + for i in 0..5 { + assert_eq!(arr1.is_null(i), arr2.is_null(i)); + assert_eq!(arr1.is_valid(i), 
arr2.is_valid(i)); + if arr1.is_valid(i) { + assert_eq!(arr1.value(i), arr2.value(i)); + } + } + } + + #[test] + fn test_primitive_array_builder_finish() { + let mut builder = Int32Builder::new(5); + builder.append_slice(&[2, 4, 6, 8]).unwrap(); + let mut arr = builder.finish(); + assert_eq!(4, arr.len()); + assert_eq!(0, builder.len()); + + builder.append_slice(&[1, 3, 5, 7, 9]).unwrap(); + arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } +} diff --git a/arrow/src/array/builder/primitive_dictionary_builder.rs b/arrow/src/array/builder/primitive_dictionary_builder.rs new file mode 100644 index 000000000000..93695e0b730a --- /dev/null +++ b/arrow/src/array/builder/primitive_dictionary_builder.rs @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +use crate::array::ArrayRef; +use crate::array::ArrowPrimitiveType; +use crate::array::DictionaryArray; +use crate::datatypes::ArrowNativeType; +use crate::datatypes::ToByteSlice; +use crate::error::{ArrowError, Result}; + +use super::ArrayBuilder; +use super::PrimitiveBuilder; + +/// Array builder for `DictionaryArray`. 
For example to map a set of byte indices +/// to f32 values. Note that the use of a `HashMap` here will not scale to very large +/// arrays or result in an ordered dictionary. +/// +/// # Example: +/// +/// ``` +/// use arrow::array::{ +/// Array, PrimitiveBuilder, PrimitiveDictionaryBuilder, +/// UInt8Array, UInt32Array, +/// }; +/// use arrow::datatypes::{UInt8Type, UInt32Type}; +/// +/// let key_builder = PrimitiveBuilder::::new(3); +/// let value_builder = PrimitiveBuilder::::new(2); +/// let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); +/// builder.append(12345678).unwrap(); +/// builder.append_null().unwrap(); +/// builder.append(22345678).unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &UInt8Array::from(vec![Some(0), None, Some(1)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); +/// let avs: &[u32] = ava.values(); +/// +/// assert!(!array.is_null(0)); +/// assert!(array.is_null(1)); +/// assert!(!array.is_null(2)); +/// +/// assert_eq!(avs, &[12345678, 22345678]); +/// ``` +#[derive(Debug)] +pub struct PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + map: HashMap, K::Native>, +} + +impl PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + /// Creates a new `PrimitiveDictionaryBuilder` from a keys builder and a value builder. + pub fn new( + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + ) -> Self { + Self { + keys_builder, + values_builder, + map: HashMap::new(), + } + } +} + +impl ArrayBuilder for PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + /// Returns the builder as an non-mutable `Any` reference. 
+ fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.keys_builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.keys_builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + /// Append a primitive value to the array. Return an existing index + /// if already present in the values array or a new index if the + /// value is appended to the values array. + #[inline] + pub fn append(&mut self, value: V::Native) -> Result { + if let Some(&key) = self.map.get(value.to_byte_slice()) { + // Append existing value. + self.keys_builder.append_value(key)?; + Ok(key) + } else { + // Append new value. + let key = K::Native::from_usize(self.values_builder.len()) + .ok_or(ArrowError::DictionaryKeyOverflowError)?; + self.values_builder.append_value(value)?; + self.keys_builder.append_value(key as K::Native)?; + self.map.insert(value.to_byte_slice().into(), key); + Ok(key) + } + } + + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.keys_builder.append_null() + } + + /// Builds the `DictionaryArray` and reset this builder. 
+ pub fn finish(&mut self) -> DictionaryArray { + self.map.clear(); + let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); + self.keys_builder.finish_dict(value_ref) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::array::UInt32Array; + use crate::array::UInt8Array; + use crate::datatypes::UInt32Type; + use crate::datatypes::UInt8Type; + + #[test] + fn test_primitive_dictionary_builder() { + let key_builder = PrimitiveBuilder::::new(3); + let value_builder = PrimitiveBuilder::::new(2); + let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + builder.append(12345678).unwrap(); + builder.append_null().unwrap(); + builder.append(22345678).unwrap(); + let array = builder.finish(); + + assert_eq!( + array.keys(), + &UInt8Array::from(vec![Some(0), None, Some(1)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); + let avs: &[u32] = ava.values(); + + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert!(!array.is_null(2)); + + assert_eq!(avs, &[12345678, 22345678]); + } + + #[test] + #[should_panic(expected = "DictionaryKeyOverflowError")] + fn test_primitive_dictionary_overflow() { + let key_builder = PrimitiveBuilder::::new(257); + let value_builder = PrimitiveBuilder::::new(257); + let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + // 256 unique keys. 
+ for i in 0..256 { + builder.append(i + 1000).unwrap(); + } + // Special error if the key overflows (256th entry) + builder.append(1257).unwrap(); + } +} diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow/src/array/builder/string_dictionary_builder.rs new file mode 100644 index 000000000000..d1b872fd3134 --- /dev/null +++ b/arrow/src/array/builder/string_dictionary_builder.rs @@ -0,0 +1,313 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +use crate::array::array::Array; +use crate::array::ArrayBuilder; +use crate::array::ArrayRef; +use crate::array::ArrowDictionaryKeyType; +use crate::array::DictionaryArray; +use crate::array::PrimitiveBuilder; +use crate::array::StringArray; +use crate::array::StringBuilder; +use crate::datatypes::ArrowNativeType; +use crate::error::{ArrowError, Result}; + +/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices +/// to String values. Note that the use of a `HashMap` here will not scale to very large +/// arrays or result in an ordered dictionary. 
+/// +/// ``` +/// use arrow::{ +/// array::{ +/// Int8Array, StringArray, +/// PrimitiveBuilder, StringBuilder, StringDictionaryBuilder, +/// }, +/// datatypes::Int8Type, +/// }; +/// +/// // Create a dictionary array indexed by bytes whose values are Strings. +/// // It can thus hold up to 256 distinct string values. +/// +/// let key_builder = PrimitiveBuilder::::new(100); +/// let value_builder = StringBuilder::new(100); +/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); +/// +/// // The builder builds the dictionary value by value +/// builder.append("abc").unwrap(); +/// builder.append_null().unwrap(); +/// builder.append("def").unwrap(); +/// builder.append("def").unwrap(); +/// builder.append("abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), "abc"); +/// assert_eq!(ava.value(1), "def"); +/// +/// ``` +#[derive(Debug)] +pub struct StringDictionaryBuilder +where + K: ArrowDictionaryKeyType, +{ + keys_builder: PrimitiveBuilder, + values_builder: StringBuilder, + map: HashMap, K::Native>, +} + +impl StringDictionaryBuilder +where + K: ArrowDictionaryKeyType, +{ + /// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder. + pub fn new(keys_builder: PrimitiveBuilder, values_builder: StringBuilder) -> Self { + Self { + keys_builder, + values_builder, + map: HashMap::new(), + } + } + + /// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary + /// which is initialized with the given values. + /// The indices of those dictionary values are used as keys. 
+ /// + /// # Example + /// + /// ``` + /// use arrow::datatypes::Int16Type; + /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array}; + /// use std::convert::TryFrom; + /// + /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); + /// + /// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::::new(3), &dictionary_values).unwrap(); + /// builder.append("def").unwrap(); + /// builder.append_null().unwrap(); + /// builder.append("abc").unwrap(); + /// + /// let dictionary_array = builder.finish(); + /// + /// let keys = dictionary_array.keys(); + /// + /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)])); + /// ``` + pub fn new_with_dictionary( + keys_builder: PrimitiveBuilder, + dictionary_values: &StringArray, + ) -> Result { + let dict_len = dictionary_values.len(); + let mut values_builder = + StringBuilder::with_capacity(dict_len, dictionary_values.value_data().len()); + let mut map: HashMap, K::Native> = HashMap::with_capacity(dict_len); + for i in 0..dict_len { + if dictionary_values.is_valid(i) { + let value = dictionary_values.value(i); + map.insert( + value.as_bytes().into(), + K::Native::from_usize(i) + .ok_or(ArrowError::DictionaryKeyOverflowError)?, + ); + values_builder.append_value(value)?; + } else { + values_builder.append_null()?; + } + } + Ok(Self { + keys_builder, + values_builder, + map, + }) + } +} + +impl ArrayBuilder for StringDictionaryBuilder +where + K: ArrowDictionaryKeyType, +{ + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. 
+ fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.keys_builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.keys_builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl StringDictionaryBuilder +where + K: ArrowDictionaryKeyType, +{ + /// Append a primitive value to the array. Return an existing index + /// if already present in the values array or a new index if the + /// value is appended to the values array. + pub fn append(&mut self, value: impl AsRef) -> Result { + if let Some(&key) = self.map.get(value.as_ref().as_bytes()) { + // Append existing value. + self.keys_builder.append_value(key)?; + Ok(key) + } else { + // Append new value. + let key = K::Native::from_usize(self.values_builder.len()) + .ok_or(ArrowError::DictionaryKeyOverflowError)?; + self.values_builder.append_value(value.as_ref())?; + self.keys_builder.append_value(key as K::Native)?; + self.map.insert(value.as_ref().as_bytes().into(), key); + Ok(key) + } + } + + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.keys_builder.append_null() + } + + /// Builds the `DictionaryArray` and reset this builder. 
+ pub fn finish(&mut self) -> DictionaryArray { + self.map.clear(); + let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); + self.keys_builder.finish_dict(value_ref) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::array::Int8Array; + use crate::datatypes::Int16Type; + use crate::datatypes::Int8Type; + + #[test] + fn test_string_dictionary_builder() { + let key_builder = PrimitiveBuilder::::new(5); + let value_builder = StringBuilder::new(2); + let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + builder.append("abc").unwrap(); + builder.append_null().unwrap(); + builder.append("def").unwrap(); + builder.append("def").unwrap(); + builder.append("abc").unwrap(); + let array = builder.finish(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); + + assert_eq!(ava.value(0), "abc"); + assert_eq!(ava.value(1), "def"); + } + + #[test] + fn test_string_dictionary_builder_with_existing_dictionary() { + let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]); + + let key_builder = PrimitiveBuilder::::new(6); + let mut builder = + StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) + .unwrap(); + builder.append("abc").unwrap(); + builder.append_null().unwrap(); + builder.append("def").unwrap(); + builder.append("def").unwrap(); + builder.append("abc").unwrap(); + builder.append("ghi").unwrap(); + let array = builder.finish(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av = array.values(); + let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); + + assert!(!ava.is_valid(0)); + assert_eq!(ava.value(1), "def"); + assert_eq!(ava.value(2), "abc"); + assert_eq!(ava.value(3), "ghi"); + } + + #[test] + fn test_string_dictionary_builder_with_reserved_null_value() { + let dictionary: Vec> = vec![None]; + let dictionary = StringArray::from(dictionary); + + let key_builder = PrimitiveBuilder::::new(4); + let mut builder = + StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) + .unwrap(); + builder.append("abc").unwrap(); + builder.append_null().unwrap(); + builder.append("def").unwrap(); + builder.append("abc").unwrap(); + let array = builder.finish(); + + assert!(array.is_null(1)); + assert!(!array.is_valid(1)); + + let keys = array.keys(); + + assert_eq!(keys.value(0), 1); + assert!(keys.is_null(1)); + // zero initialization is currently guaranteed by Buffer allocation and resizing + assert_eq!(keys.value(1), 0); + assert_eq!(keys.value(2), 2); + assert_eq!(keys.value(3), 1); + } +} diff --git a/arrow/src/array/builder/struct_builder.rs b/arrow/src/array/builder/struct_builder.rs new file mode 100644 index 000000000000..e69844b71739 --- /dev/null +++ b/arrow/src/array/builder/struct_builder.rs @@ -0,0 +1,420 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use crate::array::*; +use crate::datatypes::DataType; +use crate::datatypes::Field; +use crate::error::Result; + +/// Array builder for Struct types. +/// +/// Note that callers should make sure that methods of all the child field builders are +/// properly called to maintain the consistency of the data structure. +pub struct StructBuilder { + fields: Vec, + field_builders: Vec>, + bitmap_builder: BooleanBufferBuilder, + len: usize, +} + +impl fmt::Debug for StructBuilder { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StructBuilder") + .field("fields", &self.fields) + .field("bitmap_builder", &self.bitmap_builder) + .field("len", &self.len) + .finish() + } +} + +impl ArrayBuilder for StructBuilder { + /// Returns the number of array slots in the builder. + /// + /// Note that this always return the first child field builder's length, and it is + /// the caller's responsibility to maintain the consistency that all the child field + /// builder should have the equal number of elements. + fn len(&self) -> usize { + self.len + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Builds the array. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Returns the builder as a non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. 
In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } +} + +/// Returns a builder with capacity `capacity` that corresponds to the datatype `DataType` +/// This function is useful to construct arrays from an arbitrary vectors with known/expected +/// schema. +pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + match datatype { + DataType::Null => unimplemented!(), + DataType::Boolean => Box::new(BooleanBuilder::new(capacity)), + DataType::Int8 => Box::new(Int8Builder::new(capacity)), + DataType::Int16 => Box::new(Int16Builder::new(capacity)), + DataType::Int32 => Box::new(Int32Builder::new(capacity)), + DataType::Int64 => Box::new(Int64Builder::new(capacity)), + DataType::UInt8 => Box::new(UInt8Builder::new(capacity)), + DataType::UInt16 => Box::new(UInt16Builder::new(capacity)), + DataType::UInt32 => Box::new(UInt32Builder::new(capacity)), + DataType::UInt64 => Box::new(UInt64Builder::new(capacity)), + DataType::Float32 => Box::new(Float32Builder::new(capacity)), + DataType::Float64 => Box::new(Float64Builder::new(capacity)), + DataType::Binary => Box::new(BinaryBuilder::new(capacity)), + DataType::FixedSizeBinary(len) => { + Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) + } + DataType::Decimal(precision, scale) => { + Box::new(DecimalBuilder::new(capacity, *precision, *scale)) + } + DataType::Utf8 => Box::new(StringBuilder::new(capacity)), + DataType::Date32 => Box::new(Date32Builder::new(capacity)), + DataType::Date64 => Box::new(Date64Builder::new(capacity)), + DataType::Time32(TimeUnit::Second) => { + Box::new(Time32SecondBuilder::new(capacity)) + } + DataType::Time32(TimeUnit::Millisecond) => { + Box::new(Time32MillisecondBuilder::new(capacity)) + } + DataType::Time64(TimeUnit::Microsecond) => { + 
Box::new(Time64MicrosecondBuilder::new(capacity)) + } + DataType::Time64(TimeUnit::Nanosecond) => { + Box::new(Time64NanosecondBuilder::new(capacity)) + } + DataType::Timestamp(TimeUnit::Second, _) => { + Box::new(TimestampSecondBuilder::new(capacity)) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + Box::new(TimestampMillisecondBuilder::new(capacity)) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + Box::new(TimestampMicrosecondBuilder::new(capacity)) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + Box::new(TimestampNanosecondBuilder::new(capacity)) + } + DataType::Interval(IntervalUnit::YearMonth) => { + Box::new(IntervalYearMonthBuilder::new(capacity)) + } + DataType::Interval(IntervalUnit::DayTime) => { + Box::new(IntervalDayTimeBuilder::new(capacity)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Box::new(IntervalMonthDayNanoBuilder::new(capacity)) + } + DataType::Duration(TimeUnit::Second) => { + Box::new(DurationSecondBuilder::new(capacity)) + } + DataType::Duration(TimeUnit::Millisecond) => { + Box::new(DurationMillisecondBuilder::new(capacity)) + } + DataType::Duration(TimeUnit::Microsecond) => { + Box::new(DurationMicrosecondBuilder::new(capacity)) + } + DataType::Duration(TimeUnit::Nanosecond) => { + Box::new(DurationNanosecondBuilder::new(capacity)) + } + DataType::Struct(fields) => { + Box::new(StructBuilder::from_fields(fields.clone(), capacity)) + } + t => panic!("Data type {:?} is not currently supported", t), + } +} + +impl StructBuilder { + pub fn new(fields: Vec, field_builders: Vec>) -> Self { + Self { + fields, + field_builders, + bitmap_builder: BooleanBufferBuilder::new(0), + len: 0, + } + } + + pub fn from_fields(fields: Vec, capacity: usize) -> Self { + let mut builders = Vec::with_capacity(fields.len()); + for field in &fields { + builders.push(make_builder(field.data_type(), capacity)); + } + Self::new(fields, builders) + } + + /// Returns a mutable reference to the child field builder at index 
`i`. + /// Result will be `None` if the input type `T` provided doesn't match the actual + /// field builder's type. + pub fn field_builder(&mut self, i: usize) -> Option<&mut T> { + self.field_builders[i].as_any_mut().downcast_mut::() + } + + /// Returns the number of fields for the struct this builder is building. + pub fn num_fields(&self) -> usize { + self.field_builders.len() + } + + /// Appends an element (either null or non-null) to the struct. The actual elements + /// should be appended for each child sub-array in a consistent way. + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.bitmap_builder.append(is_valid); + self.len += 1; + Ok(()) + } + + /// Appends a null element to the struct. + #[inline] + pub fn append_null(&mut self) -> Result<()> { + self.append(false) + } + + /// Builds the `StructArray` and reset this builder. + pub fn finish(&mut self) -> StructArray { + let mut child_data = Vec::with_capacity(self.field_builders.len()); + for f in &mut self.field_builders { + let arr = f.finish(); + child_data.push(arr.data().clone()); + } + + let null_bit_buffer = self.bitmap_builder.finish(); + let null_count = self.len - null_bit_buffer.count_set_bits(); + let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone())) + .len(self.len) + .child_data(child_data); + if null_count > 0 { + builder = builder.null_bit_buffer(Some(null_bit_buffer)); + } + + self.len = 0; + + let array_data = unsafe { builder.build_unchecked() }; + StructArray::from(array_data) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::bitmap::Bitmap; + use crate::buffer::Buffer; + + #[test] + fn test_struct_array_builder() { + let string_builder = StringBuilder::new(4); + let int_builder = Int32Builder::new(4); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Utf8, false)); + field_builders.push(Box::new(string_builder) as Box); + 
fields.push(Field::new("f2", DataType::Int32, false)); + field_builders.push(Box::new(int_builder) as Box); + + let mut builder = StructBuilder::new(fields, field_builders); + assert_eq!(2, builder.num_fields()); + + let string_builder = builder + .field_builder::(0) + .expect("builder at field 0 should be string builder"); + string_builder.append_value("joe").unwrap(); + string_builder.append_null().unwrap(); + string_builder.append_null().unwrap(); + string_builder.append_value("mark").unwrap(); + + let int_builder = builder + .field_builder::(1) + .expect("builder at field 1 should be int builder"); + int_builder.append_value(1).unwrap(); + int_builder.append_value(2).unwrap(); + int_builder.append_null().unwrap(); + int_builder.append_value(4).unwrap(); + + builder.append(true).unwrap(); + builder.append(true).unwrap(); + builder.append_null().unwrap(); + builder.append(true).unwrap(); + + let arr = builder.finish(); + + let struct_data = arr.data(); + assert_eq!(4, struct_data.len()); + assert_eq!(1, struct_data.null_count()); + assert_eq!( + Some(&Bitmap::from(Buffer::from(&[11_u8]))), + struct_data.null_bitmap() + ); + + let expected_string_data = ArrayData::builder(DataType::Utf8) + .len(4) + .null_bit_buffer(Some(Buffer::from(&[9_u8]))) + .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) + .add_buffer(Buffer::from_slice_ref(b"joemark")) + .build() + .unwrap(); + + let expected_int_data = ArrayData::builder(DataType::Int32) + .len(4) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[11_u8]))) + .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) + .build() + .unwrap(); + + assert_eq!(expected_string_data, *arr.column(0).data()); + assert_eq!(expected_int_data, *arr.column(1).data()); + } + + #[test] + fn test_struct_array_builder_finish() { + let int_builder = Int32Builder::new(10); + let bool_builder = BooleanBuilder::new(10); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Int32, false)); 
+ field_builders.push(Box::new(int_builder) as Box); + fields.push(Field::new("f2", DataType::Boolean, false)); + field_builders.push(Box::new(bool_builder) as Box); + + let mut builder = StructBuilder::new(fields, field_builders); + builder + .field_builder::(0) + .unwrap() + .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + .unwrap(); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[ + false, true, false, true, false, true, false, true, false, true, + ]) + .unwrap(); + + // Append slot values - all are valid. + for _ in 0..10 { + assert!(builder.append(true).is_ok()) + } + + assert_eq!(10, builder.len()); + + let arr = builder.finish(); + + assert_eq!(10, arr.len()); + assert_eq!(0, builder.len()); + + builder + .field_builder::(0) + .unwrap() + .append_slice(&[1, 3, 5, 7, 9]) + .unwrap(); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[false, true, false, true, false]) + .unwrap(); + + // Append slot values - all are valid. + for _ in 0..5 { + assert!(builder.append(true).is_ok()) + } + + assert_eq!(5, builder.len()); + + let arr = builder.finish(); + + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_struct_array_builder_from_schema() { + let mut fields = vec![ + Field::new("f1", DataType::Float32, false), + Field::new("f2", DataType::Utf8, false), + ]; + let sub_fields = vec![ + Field::new("g1", DataType::Int32, false), + Field::new("g2", DataType::Boolean, false), + ]; + let struct_type = DataType::Struct(sub_fields); + fields.push(Field::new("f3", struct_type, false)); + + let mut builder = StructBuilder::from_fields(fields, 5); + assert_eq!(3, builder.num_fields()); + assert!(builder.field_builder::(0).is_some()); + assert!(builder.field_builder::(1).is_some()); + assert!(builder.field_builder::(2).is_some()); + } + + #[test] + #[should_panic( + expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }) is not 
currently supported" + )] + fn test_struct_array_builder_from_schema_unsupported_type() { + let mut fields = vec![Field::new("f1", DataType::Int16, false)]; + let list_type = + DataType::List(Box::new(Field::new("item", DataType::Int64, true))); + fields.push(Field::new("f2", list_type, false)); + + let _ = StructBuilder::from_fields(fields, 5); + } + + #[test] + fn test_struct_array_builder_field_builder_type_mismatch() { + let int_builder = Int32Builder::new(10); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Int32, false)); + field_builders.push(Box::new(int_builder) as Box); + + let mut builder = StructBuilder::new(fields, field_builders); + assert!(builder.field_builder::(0).is_none()); + } +} diff --git a/arrow/src/array/builder/union_builder.rs b/arrow/src/array/builder/union_builder.rs new file mode 100644 index 000000000000..78f9a3f4b430 --- /dev/null +++ b/arrow/src/array/builder/union_builder.rs @@ -0,0 +1,338 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::collections::HashMap; + +use crate::array::ArrayDataBuilder; +use crate::array::Int32BufferBuilder; +use crate::array::Int8BufferBuilder; +use crate::array::UnionArray; +use crate::buffer::MutableBuffer; + +use crate::datatypes::ArrowPrimitiveType; +use crate::datatypes::DataType; +use crate::datatypes::Field; +use crate::datatypes::IntervalMonthDayNanoType; +use crate::datatypes::IntervalUnit; +use crate::datatypes::{Float32Type, Float64Type}; +use crate::datatypes::{Int16Type, Int32Type, Int64Type, Int8Type}; +use crate::datatypes::{UInt16Type, UInt32Type, UInt64Type, UInt8Type}; +use crate::error::{ArrowError, Result}; + +use super::{BooleanBufferBuilder, BufferBuilder}; + +use super::buffer_builder::builder_to_mutable_buffer; +use super::buffer_builder::mutable_buffer_to_builder; +use crate::array::make_array; + +/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`. +#[derive(Debug)] +struct FieldData { + /// The type id for this field + type_id: i8, + /// The Arrow data type represented in the `values_buffer`, which is untyped + data_type: DataType, + /// A buffer containing the values for this field in raw bytes + values_buffer: Option, + /// The number of array slots represented by the buffer + slots: usize, + /// A builder for the null bitmap + bitmap_builder: BooleanBufferBuilder, +} + +impl FieldData { + /// Creates a new `FieldData`. + fn new(type_id: i8, data_type: DataType) -> Self { + Self { + type_id, + data_type, + values_buffer: Some(MutableBuffer::new(1)), + slots: 0, + bitmap_builder: BooleanBufferBuilder::new(1), + } + } + + /// Appends a single value to this `FieldData`'s `values_buffer`. 
+ #[allow(clippy::unnecessary_wraps)] + fn append_to_values_buffer( + &mut self, + v: T::Native, + ) -> Result<()> { + let values_buffer = self + .values_buffer + .take() + .expect("Values buffer was never created"); + let mut builder: BufferBuilder = + mutable_buffer_to_builder(values_buffer, self.slots); + builder.append(v); + let mutable_buffer = builder_to_mutable_buffer(builder); + self.values_buffer = Some(mutable_buffer); + + self.slots += 1; + self.bitmap_builder.append(true); + Ok(()) + } + + /// Appends a null to this `FieldData`. + #[allow(clippy::unnecessary_wraps)] + fn append_null(&mut self) -> Result<()> { + let values_buffer = self + .values_buffer + .take() + .expect("Values buffer was never created"); + + let mut builder: BufferBuilder = + mutable_buffer_to_builder(values_buffer, self.slots); + + builder.advance(1); + let mutable_buffer = builder_to_mutable_buffer(builder); + self.values_buffer = Some(mutable_buffer); + self.slots += 1; + self.bitmap_builder.append(false); + Ok(()) + } + + /// Appends a null to this `FieldData` when the type is not known at compile time. + /// + /// As the main `append` method of `UnionBuilder` is generic, we need a way to append null + /// slots to the fields that are not being appended to in the case of sparse unions. This + /// method solves this problem by appending dynamically based on `DataType`. + /// + /// Note, this method does **not** update the length of the `UnionArray` (this is done by the + /// main append operation) and assumes that it is called from a method that is generic over `T` + /// where `T` satisfies the bound `ArrowPrimitiveType`. + fn append_null_dynamic(&mut self) -> Result<()> { + match self.data_type { + DataType::Null => unimplemented!(), + DataType::Int8 => self.append_null::()?, + DataType::Int16 => self.append_null::()?, + DataType::Int32 + | DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + self.append_null::()? 
+ } + DataType::Int64 + | DataType::Timestamp(_, _) + | DataType::Date64 + | DataType::Time64(_) + | DataType::Interval(IntervalUnit::DayTime) + | DataType::Duration(_) => self.append_null::()?, + DataType::Interval(IntervalUnit::MonthDayNano) => self.append_null::()?, + DataType::UInt8 => self.append_null::()?, + DataType::UInt16 => self.append_null::()?, + DataType::UInt32 => self.append_null::()?, + DataType::UInt64 => self.append_null::()?, + DataType::Float32 => self.append_null::()?, + DataType::Float64 => self.append_null::()?, + _ => unreachable!("All cases of types that satisfy the trait bounds over T are covered above."), + }; + Ok(()) + } +} + +/// Builder type for creating a new `UnionArray`. +/// +/// Example: **Dense Memory Layout** +/// +/// ``` +/// use arrow::array::UnionBuilder; +/// use arrow::datatypes::{Float64Type, Int32Type}; +/// +/// let mut builder = UnionBuilder::new_dense(3); +/// builder.append::("a", 1).unwrap(); +/// builder.append::("b", 3.0).unwrap(); +/// builder.append::("a", 4).unwrap(); +/// let union = builder.build().unwrap(); +/// +/// assert_eq!(union.type_id(0), 0_i8); +/// assert_eq!(union.type_id(1), 1_i8); +/// assert_eq!(union.type_id(2), 0_i8); +/// +/// assert_eq!(union.value_offset(0), 0_i32); +/// assert_eq!(union.value_offset(1), 0_i32); +/// assert_eq!(union.value_offset(2), 1_i32); +/// ``` +/// +/// Example: **Sparse Memory Layout** +/// ``` +/// use arrow::array::UnionBuilder; +/// use arrow::datatypes::{Float64Type, Int32Type}; +/// +/// let mut builder = UnionBuilder::new_sparse(3); +/// builder.append::("a", 1).unwrap(); +/// builder.append::("b", 3.0).unwrap(); +/// builder.append::("a", 4).unwrap(); +/// let union = builder.build().unwrap(); +/// +/// assert_eq!(union.type_id(0), 0_i8); +/// assert_eq!(union.type_id(1), 1_i8); +/// assert_eq!(union.type_id(2), 0_i8); +/// +/// assert_eq!(union.value_offset(0), 0_i32); +/// assert_eq!(union.value_offset(1), 1_i32); +/// assert_eq!(union.value_offset(2), 
2_i32); +/// ``` +#[derive(Debug)] +pub struct UnionBuilder { + /// The current number of slots in the array + len: usize, + /// Maps field names to `FieldData` instances which track the builders for that field + fields: HashMap, + /// Builder to keep track of type ids + type_id_builder: Int8BufferBuilder, + /// Builder to keep track of offsets (`None` for sparse unions) + value_offset_builder: Option, +} + +impl UnionBuilder { + /// Creates a new dense array builder. + pub fn new_dense(capacity: usize) -> Self { + Self { + len: 0, + fields: HashMap::default(), + type_id_builder: Int8BufferBuilder::new(capacity), + value_offset_builder: Some(Int32BufferBuilder::new(capacity)), + } + } + + /// Creates a new sparse array builder. + pub fn new_sparse(capacity: usize) -> Self { + Self { + len: 0, + fields: HashMap::default(), + type_id_builder: Int8BufferBuilder::new(capacity), + value_offset_builder: None, + } + } + + /// Appends a null to this builder, encoding the null in the array + /// of the `type_name` child / field. + /// + /// Since `UnionArray` encodes nulls as an entry in its children + /// (it doesn't have a validity bitmap itself), and where the null + /// is part of the final array, appending a NULL requires + /// specifying which field (child) to use. + #[inline] + pub fn append_null(&mut self, type_name: &str) -> Result<()> { + self.append_option::(type_name, None) + } + + /// Appends a value to this builder. 
+ #[inline] + pub fn append( + &mut self, + type_name: &str, + v: T::Native, + ) -> Result<()> { + self.append_option::(type_name, Some(v)) + } + + fn append_option( + &mut self, + type_name: &str, + v: Option, + ) -> Result<()> { + let type_name = type_name.to_string(); + + let mut field_data = match self.fields.remove(&type_name) { + Some(data) => { + if data.data_type != T::DATA_TYPE { + return Err(ArrowError::InvalidArgumentError(format!("Attempt to write col \"{}\" with type {} doesn't match existing type {}", type_name, T::DATA_TYPE, data.data_type))); + } + data + } + None => match self.value_offset_builder { + Some(_) => FieldData::new(self.fields.len() as i8, T::DATA_TYPE), + None => { + let mut fd = FieldData::new(self.fields.len() as i8, T::DATA_TYPE); + for _ in 0..self.len { + fd.append_null::()?; + } + fd + } + }, + }; + self.type_id_builder.append(field_data.type_id); + + match &mut self.value_offset_builder { + // Dense Union + Some(offset_builder) => { + offset_builder.append(field_data.slots as i32); + } + // Sparse Union + None => { + for (_, fd) in self.fields.iter_mut() { + // Append to all bar the FieldData currently being appended to + fd.append_null_dynamic()?; + } + } + } + + match v { + Some(v) => field_data.append_to_values_buffer::(v)?, + None => field_data.append_null::()?, + } + + self.fields.insert(type_name, field_data); + self.len += 1; + Ok(()) + } + + /// Builds this builder creating a new `UnionArray`. 
+ pub fn build(mut self) -> Result { + let type_id_buffer = self.type_id_builder.finish(); + let value_offsets_buffer = self.value_offset_builder.map(|mut b| b.finish()); + let mut children = Vec::new(); + for ( + name, + FieldData { + type_id, + data_type, + values_buffer, + slots, + mut bitmap_builder, + }, + ) in self.fields.into_iter() + { + let buffer = values_buffer + .expect("The `values_buffer` should only ever be None inside the `append` method.") + .into(); + let arr_data_builder = ArrayDataBuilder::new(data_type.clone()) + .add_buffer(buffer) + .len(slots) + .null_bit_buffer(Some(bitmap_builder.finish())); + + let arr_data_ref = unsafe { arr_data_builder.build_unchecked() }; + let array_ref = make_array(arr_data_ref); + children.push((type_id, (Field::new(&name, data_type, false), array_ref))) + } + + children.sort_by(|a, b| { + a.0.partial_cmp(&b.0) + .expect("This will never be None as type ids are always i8 values.") + }); + let children: Vec<_> = children.into_iter().map(|(_, b)| b).collect(); + + let type_ids: Vec = (0_i8..children.len() as i8).collect(); + + UnionArray::try_new(&type_ids, type_id_buffer, value_offsets_buffer, children) + } +} + +#[cfg(test)] +mod tests {} diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index cb6b894a058d..3e7e66496162 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -18,7 +18,7 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. 
-use crate::datatypes::{DataType, IntervalUnit, UnionMode}; +use crate::datatypes::{validate_decimal_precision, DataType, IntervalUnit, UnionMode}; use crate::error::{ArrowError, Result}; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; use crate::{ @@ -712,10 +712,10 @@ impl ArrayData { // Additional Type specific checks match &self.data_type { DataType::Utf8 | DataType::Binary => { - self.validate_offsets::(&self.buffers[0], self.buffers[1].len())?; + self.validate_offsets::(self.buffers[1].len())?; } DataType::LargeUtf8 | DataType::LargeBinary => { - self.validate_offsets::(&self.buffers[0], self.buffers[1].len())?; + self.validate_offsets::(self.buffers[1].len())?; } DataType::Dictionary(key_type, _value_type) => { // At the moment, constructing a DictionaryArray will also check this @@ -738,40 +738,46 @@ impl ArrayData { /// entries. /// /// For an empty array, the `buffer` can also be empty. - fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>( - &'a self, - buffer: &'a Buffer, - ) -> Result<&'a [T]> { + fn typed_offsets(&self) -> Result<&[T]> { // An empty list-like array can have 0 offsets - if buffer.is_empty() && self.len == 0 { + if self.len == 0 && self.buffers[0].is_empty() { return Ok(&[]); } - // Validate that there are the correct number of offsets for this array's length - let required_offsets = self.len + self.offset + 1; + self.typed_buffer(0, self.len + 1) + } + + /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating + fn typed_buffer( + &self, + idx: usize, + len: usize, + ) -> Result<&[T]> { + let buffer = &self.buffers[idx]; - if (buffer.len() / std::mem::size_of::()) < required_offsets { + let required_len = (len + self.offset) * std::mem::size_of::(); + + if buffer.len() < required_len { return Err(ArrowError::InvalidArgumentError(format!( - "Offsets buffer size (bytes): {} isn't large enough for {}. 
Length {} needs {}", - buffer.len(), self.data_type, self.len, required_offsets + "Buffer {} of {} isn't large enough. Expected {} bytes got {}", + idx, + self.data_type, + required_len, + buffer.len() ))); } - // Justification: buffer size was validated above - Ok(unsafe { - &(buffer.typed_data::()[self.offset..self.offset + self.len + 1]) - }) + Ok(&buffer.typed_data::()[self.offset..self.offset + len]) } /// Does a cheap sanity check that the `self.len` values in `buffer` are valid /// offsets (of type T) into some other buffer of `values_length` bytes long fn validate_offsets( &self, - buffer: &Buffer, values_length: usize, ) -> Result<()> { // Justification: buffer size was validated above - let offsets = self.typed_offsets::(buffer)?; + let offsets = self.typed_offsets::()?; if offsets.is_empty() { return Ok(()); } @@ -819,12 +825,12 @@ impl ArrayData { match &self.data_type { DataType::List(field) | DataType::Map(field, _) => { let values_data = self.get_single_valid_child_data(field.data_type())?; - self.validate_offsets::(&self.buffers[0], values_data.len)?; + self.validate_offsets::(values_data.len)?; Ok(()) } DataType::LargeList(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; - self.validate_offsets::(&self.buffers[0], values_data.len)?; + self.validate_offsets::(values_data.len)?; Ok(()) } DataType::FixedSizeList(field, list_size) => { @@ -979,7 +985,7 @@ impl ArrayData { ))); } - self.validate_dictionary_offset()?; + self.validate_values()?; // validate all children recursively self.child_data @@ -997,8 +1003,15 @@ impl ArrayData { Ok(()) } - pub fn validate_dictionary_offset(&self) -> Result<()> { + pub fn validate_values(&self) -> Result<()> { match &self.data_type { + DataType::Decimal(p, _) => { + let values_buffer: &[i128] = self.typed_buffer(0, self.len)?; + for value in values_buffer { + validate_decimal_precision(*value, *p)?; + } + Ok(()) + } DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => 
self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), @@ -1007,11 +1020,11 @@ impl ArrayData { } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len + child.offset) + self.validate_offsets_full::(child.len) } DataType::LargeList(_) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len + child.offset) + self.validate_offsets_full::(child.len) } DataType::Union(_, _, _) => { // Validate Union Array as part of implementing new Union semantics @@ -1053,17 +1066,12 @@ impl ArrayData { /// /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` - fn validate_each_offset( - &self, - offsets_buffer: &Buffer, - offset_limit: usize, - validate: V, - ) -> Result<()> + fn validate_each_offset(&self, offset_limit: usize, validate: V) -> Result<()> where - T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, V: Fn(usize, Range) -> Result<()>, { - self.typed_offsets::(offsets_buffer)? + self.typed_offsets::()? 
.iter() .enumerate() .map(|(i, x)| { @@ -1109,50 +1117,39 @@ impl ArrayData { /// into `buffers[1]` are valid utf8 sequences fn validate_utf8(&self) -> Result<()> where - T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { - let offset_buffer = &self.buffers[0]; let values_buffer = &self.buffers[1].as_slice(); - self.validate_each_offset::( - offset_buffer, - values_buffer.len(), - |string_index, range| { - std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Invalid UTF8 sequence at string index {} ({:?}): {}", - string_index, range, e - )) - })?; - Ok(()) - }, - ) + self.validate_each_offset::(values_buffer.len(), |string_index, range| { + std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Invalid UTF8 sequence at string index {} ({:?}): {}", + string_index, range, e + )) + })?; + Ok(()) + }) } /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are /// between `0` and `offset_limit` fn validate_offsets_full(&self, offset_limit: usize) -> Result<()> where - T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { - let offset_buffer = &self.buffers[0]; - - self.validate_each_offset::( - offset_buffer, - offset_limit, - |_string_index, _range| { - // No validation applied to each value, but the iteration - // itself applies bounds checking to each range - Ok(()) - }, - ) + self.validate_each_offset::(offset_limit, |_string_index, _range| { + // No validation applied to each value, but the iteration + // itself applies bounds checking to each range + Ok(()) + }) } /// Validates that each value in self.buffers (typed as T) /// is within the range [0, max_value], inclusive fn check_bounds(&self, max_value: i64) -> Result<()> where - T: ArrowNativeType + 
std::convert::TryInto + num::Num + std::fmt::Display, + T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { let required_len = self.len + self.offset; let buffer = &self.buffers[0]; @@ -1163,7 +1160,7 @@ impl ArrayData { // Justification: buffer size was validated above let indexes: &[T] = - unsafe { &(buffer.typed_data::()[self.offset..self.offset + self.len]) }; + &buffer.typed_data::()[self.offset..self.offset + self.len]; indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) @@ -1492,8 +1489,9 @@ mod tests { use std::ptr::NonNull; use crate::array::{ - make_array, Array, BooleanBuilder, Int32Array, Int32Builder, Int64Array, - StringArray, StructBuilder, UInt64Array, + make_array, Array, BooleanBuilder, DecimalBuilder, FixedSizeListBuilder, + Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, + UInt8Builder, }; use crate::buffer::Buffer; use crate::datatypes::Field; @@ -1843,7 +1841,7 @@ mod tests { #[test] #[should_panic( - expected = "Offsets buffer size (bytes): 4 isn't large enough for LargeUtf8. Length 0 needs 1" + expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4" )] fn test_empty_large_utf8_array_with_wrong_type_offsets() { let data_buffer = Buffer::from(&[]); @@ -1861,7 +1859,7 @@ mod tests { #[test] #[should_panic( - expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3" + expected = "Buffer 0 of Utf8 isn't large enough. Expected 12 bytes got 8" )] fn test_validate_offsets_i32() { let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); @@ -1879,7 +1877,7 @@ mod tests { #[test] #[should_panic( - expected = "Offsets buffer size (bytes): 16 isn't large enough for LargeUtf8. Length 2 needs 3" + expected = "Buffer 0 of LargeUtf8 isn't large enough. 
Expected 24 bytes got 16" )] fn test_validate_offsets_i64() { let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); @@ -2707,4 +2705,72 @@ mod tests { assert_eq!(array, &expected); } + + #[test] + #[cfg(not(feature = "force_validate"))] + fn test_decimal_full_validation() { + let values_builder = UInt8Builder::new(10); + let byte_width = 16; + let mut fixed_size_builder = + FixedSizeListBuilder::new(values_builder, byte_width); + let value_as_bytes = DecimalBuilder::from_i128_to_fixed_size_bytes( + 123456, + fixed_size_builder.value_length() as usize, + ) + .unwrap(); + fixed_size_builder + .values() + .append_slice(value_as_bytes.as_slice()) + .unwrap(); + fixed_size_builder.append(true).unwrap(); + let fixed_size_array = fixed_size_builder.finish(); + + // Build ArrayData for Decimal + let builder = ArrayData::builder(DataType::Decimal(5, 3)) + .len(fixed_size_array.len()) + .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); + let array_data = unsafe { builder.build_unchecked() }; + let validation_result = array_data.validate_full(); + let error = validation_result.unwrap_err(); + assert_eq!( + "Invalid argument error: 123456 is too large to store in a Decimal of precision 5. 
Max is 99999", + error.to_string() + ); + } + + #[test] + fn test_decimal_validation() { + let mut builder = DecimalBuilder::new(4, 10, 4); + builder.append_value(10000).unwrap(); + builder.append_value(20000).unwrap(); + let array = builder.finish(); + + array.data().validate_full().unwrap(); + } + + #[test] + #[cfg(not(feature = "force_validate"))] + fn test_sliced_array_child() { + let values = Int32Array::from_iter_values([1, 2, 3]); + let values_sliced = values.slice(1, 2); + let offsets = Buffer::from_iter([1_i32, 3_i32]); + + let list_field = Field::new("element", DataType::Int32, false); + let data_type = DataType::List(Box::new(list_field)); + + let data = unsafe { + ArrayData::new_unchecked( + data_type, + 1, + None, + None, + 0, + vec![offsets], + vec![values_sliced.data().clone()], + ) + }; + + let err = data.validate_values().unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2"); + } } diff --git a/arrow/src/array/equal/list.rs b/arrow/src/array/equal/list.rs index 65d320c0079d..0feefa7aa11a 100644 --- a/arrow/src/array/equal/list.rs +++ b/arrow/src/array/equal/list.rs @@ -73,6 +73,11 @@ pub(super) fn list_equal( // however, one is more likely to slice into a list array and get a region that has 0 // child values. // The test that triggered this behaviour had [4, 4] as a slice of 1 value slot. + // For the edge case that zero length list arrays are always equal. 
+ if len == 0 { + return true; + } + let lhs_child_length = lhs_offsets[lhs_start + len].to_usize().unwrap() - lhs_offsets[lhs_start].to_usize().unwrap(); diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index b89a8fa53e0b..c3b0bbc95c2b 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -629,6 +629,57 @@ mod tests { test_equal(&a, &b, false); } + #[test] + fn test_empty_offsets_list_equal() { + let empty: Vec = vec![]; + let values = Int32Array::from(empty); + let empty_offsets: [u8; 0] = []; + + let a = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(&empty_offsets)) + .add_child_data(values.data().clone()) + .null_bit_buffer(Some(Buffer::from(&empty_offsets))) + .build() + .unwrap(); + + let b = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(&empty_offsets)) + .add_child_data(values.data().clone()) + .null_bit_buffer(Some(Buffer::from(&empty_offsets))) + .build() + .unwrap(); + + test_equal(&a, &b, true); + + let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data( + Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]) + .data() + .clone(), + ) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + + test_equal(&a, &c, true); + } + // Test the case where null_count > 0 #[test] fn test_list_null() { diff --git a/arrow/src/array/equal_json.rs b/arrow/src/array/equal_json.rs index 64f109df5ff9..9db1a4397cb8 100644 --- a/arrow/src/array/equal_json.rs +++ b/arrow/src/array/equal_json.rs @@ -370,7 +370,7 @@ impl JsonEqual for DecimalArray { self.is_valid(i) && (s .parse::() - .map_or_else(|_| false, |v| v == self.value(i))) + .map_or_else(|_| false, |v| v == 
self.value(i).as_i128())) } JNull => self.is_null(i), _ => false, diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 57329037bc46..12d6f440b78d 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -74,6 +74,11 @@ mod tests { let result = &ArrayData::try_from(d1)?; assert_eq!(result, expected); + + unsafe { + Arc::from_raw(array); + Arc::from_raw(schema); + } Ok(()) } diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index 18bdca621795..bc70d1a2a8ed 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -425,7 +425,7 @@ impl<'a> std::iter::Iterator for DecimalIter<'a> { if self.array.is_null(old) { Some(None) } else { - Some(Some(self.array.value(old))) + Some(Some(self.array.value(old).as_i128())) } } } diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 0bd32d347772..bbe62cf6a1f6 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -15,40 +15,66 @@ // specific language governing permissions and limitations // under the License. -//! The central type in Apache Arrow are arrays, represented -//! by the [`Array` trait](crate::array::Array). -//! An array represents a known-length sequence of values all -//! having the same type. +//! The central type in Apache Arrow are arrays, which are a known-length sequence of values +//! all having the same type. This module provides concrete implementations of each type, as +//! well as an [`Array`] trait that can be used for type-erasure. //! -//! Internally, those values are represented by one or several -//! [buffers](crate::buffer::Buffer), the number and meaning -//! of which depend on the array’s data type, as documented in -//! [the Arrow data layout specification](https://arrow.apache.org/docs/format/Columnar.html). -//! For example, the type `Int16Array` represents an Apache -//! Arrow array of 16-bit integers. +//! # Downcasting an Array //! -//! Those buffers consist of the value data itself and an -//! 
optional [bitmap buffer](crate::bitmap::Bitmap) that -//! indicates which array entries are null values. -//! The bitmap buffer can be entirely omitted if the array is -//! known to have zero null values. +//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. +//! For example, [`RecordBatch`](`crate::record_batch::RecordBatch`) stores columns as [`ArrayRef`]. //! -//! There are concrete implementations of this trait for each -//! data type, that help you access individual values of the -//! array. +//! Whilst these arrays can be passed directly to the [`compute`](crate::compute), +//! [`csv`](crate::csv), [`json`](crate::json), etc... APIs, it is often the case that you wish +//! to interact with the data directly. This requires downcasting to the concrete type of the array: +//! +//! ``` +//! # use arrow::array::{Array, Float32Array, Int32Array}; +//! # +//! fn sum_int32(array: &dyn Array) -> i32 { +//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); +//! integers.iter().map(|val| val.unwrap_or_default()).sum() +//! } +//! +//! // Note: the values for positions corresponding to nulls will be arbitrary +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_any().downcast_ref::().unwrap().values() +//! } +//! ``` //! //! # Building an Array //! -//! Arrow's `Arrays` are immutable, but there is the trait -//! [`ArrayBuilder`](crate::array::ArrayBuilder) -//! that helps you with constructing new `Arrays`. As with the -//! `Array` trait, there are builder implementations for all -//! concrete array types. +//! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`] //! -//! # Example //! ``` -//! use arrow::array::Int16Array; +//! # use arrow::array::Int32Array; +//! # use arrow::array::StringArray; +//! # use arrow::array::ListArray; +//! # use arrow::datatypes::Int32Type; +//! # +//! Int32Array::from(vec![1, 2]); +//! Int32Array::from(vec![Some(1), None]); +//! 
Int32Array::from_iter([1, 2, 3, 4]); +//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]); +//! +//! StringArray::from(vec!["foo", "bar"]); +//! StringArray::from(vec![Some("foo"), None]); +//! StringArray::from_iter([Some("foo"), None]); +//! StringArray::from_iter_values(["foo", "bar"]); +//! +//! ListArray::from_iter_primitive::([ +//! Some(vec![Some(1), None, Some(3)]), +//! None, +//! Some(vec![]) +//! ]); +//! ``` +//! +//! Additionally [`ArrayBuilder`](crate::array::ArrayBuilder) implementations can be +//! used to construct arrays with a push-based interface //! +//! ``` +//! # use arrow::array::Int16Array; +//! # //! // Create a new builder with a capacity of 100 //! let mut builder = Int16Array::builder(100); //! @@ -78,6 +104,43 @@ //! "Get slice of len 2 starting at idx 3" //! ) //! ``` +//! +//! # Zero-Copy Slicing +//! +//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this +//! data. Internally this just increments some ref-counts, and so is incredibly cheap +//! +//! ```rust +//! # use std::sync::Arc; +//! # use arrow::array::{Array, Int32Array, ArrayRef}; +//! let array = Arc::new(Int32Array::from_iter([1, 2, 3])) as ArrayRef; +//! +//! // Slice with offset 1 and length 2 +//! let sliced = array.slice(1, 2); +//! let ints = sliced.as_any().downcast_ref::().unwrap(); +//! assert_eq!(ints.values(), &[2, 3]); +//! ``` +//! +//! # Internal Representation +//! +//! Internally, arrays are represented by one or several [`Buffer`], the number and meaning of +//! which depend on the array’s data type, as documented in the [Arrow specification]. +//! +//! For example, the type `Int16Array` represents an array of 16-bit integers and consists of: +//! +//! * An optional [`Bitmap`] identifying any null values +//! * A contiguous [`Buffer`] of 16-bit integers +//! +//! Similarly, the type `StringArray` represents an array of UTF-8 strings and consists of: +//! +//! 
* An optional [`Bitmap`] identifying any null values +//! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer +//! * A values [`Buffer`] of UTF-8 encoded string data +//! +//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html +//! [`&dyn Array`]: Array +//! [`Bitmap`]: crate::bitmap::Bitmap +//! [`Buffer`]: crate::buffer::Buffer #[allow(clippy::module_inception)] mod array; @@ -398,9 +461,29 @@ pub use self::array_string::GenericStringArray; // --------------------- Array Builder --------------------- -pub use self::builder::make_builder; +pub use self::builder::ArrayBuilder; +pub use self::builder::BinaryBuilder; pub use self::builder::BooleanBufferBuilder; +pub use self::builder::BooleanBuilder; pub use self::builder::BufferBuilder; +pub use self::builder::DecimalBuilder; +pub use self::builder::FixedSizeBinaryBuilder; +pub use self::builder::FixedSizeListBuilder; +pub use self::builder::GenericListBuilder; +pub use self::builder::GenericStringBuilder; +pub use self::builder::LargeBinaryBuilder; +pub use self::builder::LargeListBuilder; +pub use self::builder::LargeStringBuilder; +pub use self::builder::ListBuilder; +pub use self::builder::MapBuilder; +pub use self::builder::PrimitiveBuilder; +pub use self::builder::PrimitiveDictionaryBuilder; +pub use self::builder::StringBuilder; +pub use self::builder::StringDictionaryBuilder; +pub use self::builder::StructBuilder; +pub use self::builder::UnionBuilder; + +pub use self::builder::make_builder; pub type Int8BufferBuilder = BufferBuilder; pub type Int16BufferBuilder = BufferBuilder; @@ -446,26 +529,6 @@ pub type DurationMicrosecondBufferBuilder = pub type DurationNanosecondBufferBuilder = BufferBuilder<::Native>; -pub use self::builder::ArrayBuilder; -pub use self::builder::BinaryBuilder; -pub use self::builder::BooleanBuilder; -pub use self::builder::DecimalBuilder; -pub use self::builder::FixedSizeBinaryBuilder; -pub use 
self::builder::FixedSizeListBuilder; -pub use self::builder::GenericListBuilder; -pub use self::builder::GenericStringBuilder; -pub use self::builder::LargeBinaryBuilder; -pub use self::builder::LargeListBuilder; -pub use self::builder::LargeStringBuilder; -pub use self::builder::ListBuilder; -pub use self::builder::MapBuilder; -pub use self::builder::PrimitiveBuilder; -pub use self::builder::PrimitiveDictionaryBuilder; -pub use self::builder::StringBuilder; -pub use self::builder::StringDictionaryBuilder; -pub use self::builder::StructBuilder; -pub use self::builder::UnionBuilder; - pub type Int8Builder = PrimitiveBuilder; pub type Int16Builder = PrimitiveBuilder; pub type Int32Builder = PrimitiveBuilder; diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 4671d82673fd..68ae7f6d4d0d 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -78,18 +78,13 @@ impl<'a> _MutableArrayData<'a> { } }; - let mut array_data_builder = ArrayDataBuilder::new(self.data_type) + ArrayDataBuilder::new(self.data_type) .offset(0) .len(self.len) .null_count(self.null_count) .buffers(buffers) - .child_data(child_data); - if self.null_count > 0 { - array_data_builder = - array_data_builder.null_bit_buffer(Some(self.null_buffer.into())); - } - - array_data_builder + .child_data(child_data) + .null_bit_buffer((self.null_count > 0).then(|| self.null_buffer.into())) } } @@ -184,48 +179,23 @@ fn build_extend_dictionary( max: usize, ) -> Option { use crate::datatypes::*; + macro_rules! 
validate_and_build { + ($dt: ty) => {{ + let _: $dt = max.try_into().ok()?; + let offset: $dt = offset.try_into().ok()?; + Some(primitive::build_extend_with_offset(array, offset)) + }}; + } match array.data_type() { DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => { - let _: u8 = max.try_into().ok()?; - let offset: u8 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::UInt16 => { - let _: u16 = max.try_into().ok()?; - let offset: u16 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::UInt32 => { - let _: u32 = max.try_into().ok()?; - let offset: u32 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::UInt64 => { - let _: u64 = max.try_into().ok()?; - let offset: u64 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::Int8 => { - let _: i8 = max.try_into().ok()?; - let offset: i8 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::Int16 => { - let _: i16 = max.try_into().ok()?; - let offset: i16 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::Int32 => { - let _: i32 = max.try_into().ok()?; - let offset: i32 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } - DataType::Int64 => { - let _: i64 = max.try_into().ok()?; - let offset: i64 = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - } + DataType::UInt8 => validate_and_build!(u8), + DataType::UInt16 => validate_and_build!(u16), + DataType::UInt32 => validate_and_build!(u32), + DataType::UInt64 => validate_and_build!(u64), + DataType::Int8 => validate_and_build!(i8), + DataType::Int16 => validate_and_build!(i16), + DataType::Int32 => validate_and_build!(i32), + DataType::Int64 => 
validate_and_build!(i64), _ => unreachable!(), }, _ => None, @@ -394,7 +364,7 @@ impl<'a> MutableArrayData<'a> { /// a [Capacities] variant is not yet supported. pub fn with_capacities( arrays: Vec<&'a ArrayData>, - mut use_nulls: bool, + use_nulls: bool, capacities: Capacities, ) -> Self { let data_type = arrays[0].data_type(); @@ -402,20 +372,22 @@ impl<'a> MutableArrayData<'a> { // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. - if arrays.iter().any(|array| array.null_count() > 0) { - use_nulls = true; - }; + let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); let mut array_capacity; let [buffer1, buffer2] = match (data_type, &capacities) { - (DataType::LargeUtf8, Capacities::Binary(capacity, Some(value_cap))) - | (DataType::LargeBinary, Capacities::Binary(capacity, Some(value_cap))) => { + ( + DataType::LargeUtf8 | DataType::LargeBinary, + Capacities::Binary(capacity, Some(value_cap)), + ) => { array_capacity = *capacity; preallocate_offset_and_binary_buffer::(*capacity, *value_cap) } - (DataType::Utf8, Capacities::Binary(capacity, Some(value_cap))) - | (DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => { + ( + DataType::Utf8 | DataType::Binary, + Capacities::Binary(capacity, Some(value_cap)), + ) => { array_capacity = *capacity; preallocate_offset_and_binary_buffer::(*capacity, *value_cap) } @@ -423,6 +395,13 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; new_buffers(data_type, *capacity) } + ( + DataType::List(_) | DataType::LargeList(_), + Capacities::List(capacity, _), + ) => { + array_capacity = *capacity; + new_buffers(data_type, *capacity) + } _ => panic!("Capacities: {:?} not yet supported", capacities), }; @@ -462,11 +441,10 @@ impl<'a> MutableArrayData<'a> { let capacities = if let Capacities::List(capacity, ref child_capacities) = capacities { - array_capacity = capacity; child_capacities .clone() .map(|c| *c) - 
.unwrap_or(Capacities::Array(array_capacity)) + .unwrap_or(Capacities::Array(capacity)) } else { Capacities::Array(array_capacity) }; @@ -721,6 +699,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); @@ -734,6 +713,7 @@ mod tests { assert_eq!(array, expected); } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_offset() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); @@ -748,6 +728,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_null_offset_nulls() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); @@ -1343,6 +1324,40 @@ mod tests { Ok(()) } + #[test] + fn test_list_append_with_capacities() -> Result<()> { + let mut builder = ListBuilder::::new(Int64Builder::new(24)); + builder.values().append_slice(&[1, 2, 3])?; + builder.append(true)?; + builder.values().append_slice(&[4, 5])?; + builder.append(true)?; + builder.values().append_slice(&[6, 7, 8])?; + builder.values().append_slice(&[9, 10, 11])?; + builder.append(true)?; + let a = builder.finish(); + + let a_builder = Int64Builder::new(24); + let mut a_builder = ListBuilder::::new(a_builder); + a_builder.values().append_slice(&[12, 13])?; + a_builder.append(true)?; + a_builder.append(true)?; + a_builder.values().append_slice(&[14, 15, 16, 17])?; + a_builder.append(true)?; + let b = a_builder.finish(); + + let mutable = MutableArrayData::with_capacities( + vec![a.data(), b.data()], + false, + Capacities::List(6, Some(Box::new(Capacities::Array(17)))), + ); + + // capacities are rounded up to multiples of 64 by MutableBuffer + assert_eq!(mutable.data.buffer1.capacity(), 64); + assert_eq!(mutable.data.child_data[0].data.buffer1.capacity(), 192); + + Ok(()) + } + #[test] fn test_map_nulls_append() -> Result<()> { let mut builder = MapBuilder::::new( diff --git 
a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs index c34ea101bb3b..f5d59c5ed555 100644 --- a/arrow/src/buffer/immutable.rs +++ b/arrow/src/buffer/immutable.rs @@ -181,19 +181,15 @@ impl Buffer { /// View buffer as typed slice. /// - /// # Safety + /// # Panics /// - /// `ArrowNativeType` is public so that it can be used as a trait bound for other public - /// components, such as the `ToByteSlice` trait. However, this means that it can be - /// implemented by user defined types, which it is not intended for. - pub unsafe fn typed_data(&self) -> &[T] { - // JUSTIFICATION - // Benefit - // Many of the buffers represent specific types, and consumers of `Buffer` often need to re-interpret them. - // Soundness - // * The pointer is non-null by construction - // * alignment asserted below. - let (prefix, offsets, suffix) = self.as_slice().align_to::(); + /// This function panics if the underlying buffer is not aligned + /// correctly for type `T`. + pub fn typed_data(&self) -> &[T] { + // SAFETY + // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect + // implementation outside this crate, and this method checks alignment + let (prefix, offsets, suffix) = unsafe { self.as_slice().align_to::() }; assert!(prefix.is_empty() && suffix.is_empty()); offsets } @@ -451,7 +447,7 @@ mod tests { macro_rules! 
check_as_typed_data { ($input: expr, $native_t: ty) => {{ let buffer = Buffer::from_slice_ref($input); - let slice: &[$native_t] = unsafe { buffer.typed_data::<$native_t>() }; + let slice: &[$native_t] = buffer.typed_data::<$native_t>(); assert_eq!($input, slice); }}; } @@ -573,12 +569,12 @@ mod tests { ) }; - let slice = unsafe { buffer.typed_data::() }; + let slice = buffer.typed_data::(); assert_eq!(slice, &[1, 2, 3, 4, 5]); let buffer = buffer.slice(std::mem::size_of::()); - let slice = unsafe { buffer.typed_data::() }; + let slice = buffer.typed_data::(); assert_eq!(slice, &[2, 3, 4, 5]); } } diff --git a/arrow/src/buffer/mod.rs b/arrow/src/buffer/mod.rs index cf0461b5f536..b392b0583d6d 100644 --- a/arrow/src/buffer/mod.rs +++ b/arrow/src/buffer/mod.rs @@ -23,6 +23,9 @@ pub use immutable::*; mod mutable; pub use mutable::*; mod ops; +mod scalar; +pub use scalar::*; + pub use ops::*; use crate::error::{ArrowError, Result}; diff --git a/arrow/src/buffer/mutable.rs b/arrow/src/buffer/mutable.rs index 709973b4401b..11783b82da54 100644 --- a/arrow/src/buffer/mutable.rs +++ b/arrow/src/buffer/mutable.rs @@ -30,7 +30,11 @@ use std::ptr::NonNull; /// along cache lines and in multiple of 64 bytes. /// Use [MutableBuffer::push] to insert an item, [MutableBuffer::extend_from_slice] /// to insert many items, and `into` to convert it to [`Buffer`]. 
+/// +/// For a safe, strongly typed API consider using [`crate::array::BufferBuilder`] +/// /// # Example +/// /// ``` /// # use arrow::buffer::{Buffer, MutableBuffer}; /// let mut buffer = MutableBuffer::new(0); @@ -152,6 +156,17 @@ impl MutableBuffer { } } + /// Truncates this buffer to `len` bytes + /// + /// If `len` is greater than the buffer's current length, this has no effect + #[inline(always)] + pub fn truncate(&mut self, len: usize) { + if len > self.len { + return; + } + self.len = len; + } + /// Resizes the buffer, either truncating its contents (with no change in capacity), or /// growing it (potentially reallocating it) and writing `value` in the newly available bytes. /// # Example @@ -273,19 +288,18 @@ impl MutableBuffer { Buffer::from_bytes(bytes) } - /// View this buffer asa slice of a specific type. - /// - /// # Safety - /// - /// This function must only be used with buffers which are treated - /// as type `T` (e.g. extended with items of type `T`). + /// View this buffer as a slice of a specific type. /// /// # Panics /// /// This function panics if the underlying buffer is not aligned /// correctly for type `T`. 
- pub unsafe fn typed_data_mut(&mut self) -> &mut [T] { - let (prefix, offsets, suffix) = self.as_slice_mut().align_to_mut::(); + pub fn typed_data_mut(&mut self) -> &mut [T] { + // SAFETY + // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect + // implementation outside this crate, and this method checks alignment + let (prefix, offsets, suffix) = + unsafe { self.as_slice_mut().align_to_mut::() }; assert!(prefix.is_empty() && suffix.is_empty()); offsets } @@ -299,7 +313,7 @@ impl MutableBuffer { /// assert_eq!(buffer.len(), 8) // u32 has 4 bytes /// ``` #[inline] - pub fn extend_from_slice(&mut self, items: &[T]) { + pub fn extend_from_slice(&mut self, items: &[T]) { let len = items.len(); let additional = len * std::mem::size_of::(); self.reserve(additional); diff --git a/arrow/src/buffer/ops.rs b/arrow/src/buffer/ops.rs index e0086a1a8207..ea155c8d78e4 100644 --- a/arrow/src/buffer/ops.rs +++ b/arrow/src/buffer/ops.rs @@ -15,110 +15,8 @@ // specific language governing permissions and limitations // under the License. -#[cfg(feature = "simd")] -use crate::util::bit_util; -#[cfg(feature = "simd")] -use packed_simd::u8x64; - -#[cfg(feature = "avx512")] -use crate::arch::avx512::*; -use crate::util::bit_util::ceil; -#[cfg(any(feature = "simd", feature = "avx512"))] -use std::borrow::BorrowMut; - use super::{Buffer, MutableBuffer}; - -/// Apply a bitwise operation `simd_op` / `scalar_op` to two inputs using simd instructions and return the result as a Buffer. -/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a time -/// and the `scalar_op` gets applied to remaining bytes. -/// Contrary to the non-simd version `bitwise_bin_op_helper`, the offset and length is specified in bytes -/// and this version does not support operations starting at arbitrary bit offsets. 
-#[cfg(feature = "simd")] -pub fn bitwise_bin_op_simd_helper( - left: &Buffer, - left_offset: usize, - right: &Buffer, - right_offset: usize, - len: usize, - simd_op: SI, - scalar_op: SC, -) -> Buffer -where - SI: Fn(u8x64, u8x64) -> u8x64, - SC: Fn(u8, u8) -> u8, -{ - let mut result = MutableBuffer::new(len).with_bitset(len, false); - let lanes = u8x64::lanes(); - - let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes); - let mut right_chunks = right.as_slice()[right_offset..].chunks_exact(lanes); - let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| { - unsafe { bit_util::bitwise_bin_op_simd(&left, &right, res, &simd_op) }; - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = scalar_op(*left, *right); - }); - - result.into() -} - -/// Apply a bitwise operation `simd_op` / `scalar_op` to one input using simd instructions and return the result as a Buffer. -/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a time -/// and the `scalar_op` gets applied to remaining bytes. -/// Contrary to the non-simd version `bitwise_unary_op_helper`, the offset and length is specified in bytes -/// and this version does not support operations starting at arbitrary bit offsets. 
-#[cfg(feature = "simd")] -pub fn bitwise_unary_op_simd_helper( - left: &Buffer, - left_offset: usize, - len: usize, - simd_op: SI, - scalar_op: SC, -) -> Buffer -where - SI: Fn(u8x64) -> u8x64, - SC: Fn(u8) -> u8, -{ - let mut result = MutableBuffer::new(len).with_bitset(len, false); - let lanes = u8x64::lanes(); - - let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes); - let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut()) - .for_each(|(res, left)| unsafe { - let data_simd = u8x64::from_slice_unaligned_unchecked(left); - let simd_result = simd_op(data_simd); - simd_result.write_to_slice_unaligned_unchecked(res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip(left_chunks.remainder().iter()) - .for_each(|(res, left)| { - *res = scalar_op(*left); - }); - - result.into() -} +use crate::util::bit_util::ceil; /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. @@ -170,9 +68,7 @@ where let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits); - // Safety: buffer is always treated as type `u64` in the code - // below. 
- let result_chunks = unsafe { result.typed_data_mut::().iter_mut() }; + let result_chunks = result.typed_data_mut::().iter_mut(); result_chunks .zip(left_chunks.iter()) @@ -189,100 +85,6 @@ where result.into() } -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - let len = len_in_bits / 8; - let left_offset = left_offset_in_bits / 8; - let right_offset = right_offset_in_bits / 8; - - let mut result = MutableBuffer::new(len).with_bitset(len, false); - - let mut left_chunks = - left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut right_chunks = - right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut result_chunks = - result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| unsafe { - avx512_bin_and(left, right, res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = *left & *right; - }); - - result.into() - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) - } -} - -#[cfg(all(feature = "simd", not(feature = "avx512")))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - bitwise_bin_op_simd_helper( - &left, - left_offset_in_bits / 8, - &right, - right_offset_in_bits / 8, - len_in_bits / 8, - |a, b| a & 
b, - |a, b| a & b, - ) - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) - } -} - -// Note: do not target specific features like x86 without considering -// other targets like wasm32, as those would fail to build -#[cfg(all(not(any(feature = "simd", feature = "avx512"))))] pub fn buffer_bin_and( left: &Buffer, left_offset_in_bits: usize, @@ -300,98 +102,6 @@ pub fn buffer_bin_and( ) } -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - let len = len_in_bits / 8; - let left_offset = left_offset_in_bits / 8; - let right_offset = right_offset_in_bits / 8; - - let mut result = MutableBuffer::new(len).with_bitset(len, false); - - let mut left_chunks = - left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut right_chunks = - right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut result_chunks = - result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| unsafe { - avx512_bin_or(left, right, res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = *left | *right; - }); - - result.into() - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) - } -} - -#[cfg(all(feature = "simd", not(feature = "avx512")))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: 
usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - bitwise_bin_op_simd_helper( - &left, - left_offset_in_bits / 8, - &right, - right_offset_in_bits / 8, - len_in_bits / 8, - |a, b| a | b, - |a, b| a | b, - ) - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) - } -} - -#[cfg(all(not(any(feature = "simd", feature = "avx512"))))] pub fn buffer_bin_or( left: &Buffer, left_offset_in_bits: usize, @@ -414,20 +124,5 @@ pub fn buffer_unary_not( offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - // SIMD implementation if available and byte-aligned - #[cfg(feature = "simd")] - if offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 { - return bitwise_unary_op_simd_helper( - &left, - offset_in_bits / 8, - len_in_bits / 8, - |a| !a, - |a| !a, - ); - } - // Default implementation - #[allow(unreachable_code)] - { - bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) - } + bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) } diff --git a/arrow/src/buffer/scalar.rs b/arrow/src/buffer/scalar.rs new file mode 100644 index 000000000000..7d663cd2bf96 --- /dev/null +++ b/arrow/src/buffer/scalar.rs @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::buffer::Buffer; +use crate::datatypes::ArrowNativeType; +use std::ops::Deref; + +/// Provides a safe API for interpreting a [`Buffer`] as a slice of [`ArrowNativeType`] +/// +/// # Safety +/// +/// All [`ArrowNativeType`] are valid for all possible backing byte representations, and as +/// a result they are "trivially safely transmutable". +#[derive(Debug)] +pub struct ScalarBuffer { + #[allow(unused)] + buffer: Buffer, + // Borrows from `buffer` and is valid for the lifetime of `buffer` + ptr: *const T, + // The length of this slice + len: usize, +} + +impl ScalarBuffer { + /// Create a new [`ScalarBuffer`] from a [`Buffer`], and an `offset` + /// and `length` in units of `T` + /// + /// # Panics + /// + /// This method will panic if + /// + /// * `offset` or `len` would result in overflow + /// * `buffer` is not aligned to a multiple of `std::mem::size_of::` + /// * `bytes` is not large enough for the requested slice + pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self { + let size = std::mem::size_of::(); + let offset_len = offset.checked_add(len).expect("length overflow"); + let start_bytes = offset.checked_mul(size).expect("start bytes overflow"); + let end_bytes = offset_len.checked_mul(size).expect("end bytes overflow"); + + let bytes = &buffer.as_slice()[start_bytes..end_bytes]; + + // SAFETY: all byte sequences correspond to a valid instance of T + let (prefix, offsets, suffix) = unsafe { bytes.align_to::() }; + assert!( + prefix.is_empty() && suffix.is_empty(), + "buffer is not aligned to {} byte boundary", + size 
+ ); + + let ptr = offsets.as_ptr(); + Self { buffer, ptr, len } + } +} + +impl Deref for ScalarBuffer { + type Target = [T]; + + fn deref(&self) -> &Self::Target { + // SAFETY: Bounds checked in constructor and ptr is valid for the lifetime of self + unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + } +} + +impl AsRef<[T]> for ScalarBuffer { + fn as_ref(&self) -> &[T] { + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + let expected = [0_i32, 1, 2]; + let buffer = Buffer::from_iter(expected.iter().cloned()); + let typed = ScalarBuffer::::new(buffer.clone(), 0, 3); + assert_eq!(*typed, expected); + + let typed = ScalarBuffer::::new(buffer.clone(), 1, 2); + assert_eq!(*typed, expected[1..]); + + let typed = ScalarBuffer::::new(buffer.clone(), 1, 0); + assert!(typed.is_empty()); + + let typed = ScalarBuffer::::new(buffer, 3, 0); + assert!(typed.is_empty()); + } + + #[test] + #[should_panic(expected = "buffer is not aligned to 4 byte boundary")] + fn test_unaligned() { + let expected = [0_i32, 1, 2]; + let buffer = Buffer::from_iter(expected.iter().cloned()); + let buffer = buffer.slice(1); + ScalarBuffer::::new(buffer, 0, 2); + } + + #[test] + #[should_panic(expected = "range end index 16 out of range for slice of length 12")] + fn test_length_out_of_bounds() { + let buffer = Buffer::from_iter([0_i32, 1, 2]); + ScalarBuffer::::new(buffer, 1, 3); + } + + #[test] + #[should_panic(expected = "range end index 16 out of range for slice of length 12")] + fn test_offset_out_of_bounds() { + let buffer = Buffer::from_iter([0_i32, 1, 2]); + ScalarBuffer::::new(buffer, 4, 0); + } + + #[test] + #[should_panic(expected = "length overflow")] + fn test_length_overflow() { + let buffer = Buffer::from_iter([0_i32, 1, 2]); + ScalarBuffer::::new(buffer, usize::MAX, 1); + } + + #[test] + #[should_panic(expected = "start bytes overflow")] + fn test_start_overflow() { + let buffer = Buffer::from_iter([0_i32, 1, 2]); + 
ScalarBuffer::::new(buffer, usize::MAX / 4 + 1, 0); + } + + #[test] + #[should_panic(expected = "end bytes overflow")] + fn test_end_overflow() { + let buffer = Buffer::from_iter([0_i32, 1, 2]); + ScalarBuffer::::new(buffer, 0, usize::MAX / 4 + 1); + } +} diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 2d63924a12d3..04865e15bca2 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -64,7 +64,7 @@ where } let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; let values = left .values() @@ -117,7 +117,7 @@ where } let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; let buffer = if let Some(b) = &null_bit_buffer { let values = left.values().iter().zip(right.values()).enumerate().map( @@ -316,7 +316,7 @@ where // Create the combined `Bitmap` let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; let lanes = T::lanes(); let buffer_size = left.len() * std::mem::size_of::(); diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index 5f52d56bb4d2..209edc48d195 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -193,7 +193,7 @@ where let left_data = left.data_ref(); let right_data = right.data_ref(); - let null_bit_buffer = combine_option_bitmap(left_data, right_data, len)?; + let null_bit_buffer = combine_option_bitmap(&[left_data, right_data], len)?; let left_buffer = &left_data.buffers()[0]; let right_buffer = &right_data.buffers()[0]; diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 
93a8ebcb6b5a..fa92179b747c 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -353,7 +353,7 @@ macro_rules! cast_decimal_to_integer { if array.is_null(i) { value_builder.append_null()?; } else { - let v = array.value(i) / div; + let v = array.value(i).as_i128() / div; // check the overflow // For example: Decimal(128,10,0) as i8 // 128 is out of range i8 @@ -383,7 +383,7 @@ macro_rules! cast_decimal_to_float { } else { // The range of f32 or f64 is larger than i128, we don't need to check overflow. // cast the i128 to f64 will lose precision, for example the `112345678901234568` will be as `112345678901234560`. - let v = (array.value(i) as f64 / div) as $NATIVE_TYPE; + let v = (array.value(i).as_i128() as f64 / div) as $NATIVE_TYPE; value_builder.append_value(v)?; } } @@ -2084,7 +2084,7 @@ where let list_data = array.data(); let str_values_buf = str_array.value_data(); - let offsets = unsafe { list_data.buffers()[0].typed_data::() }; + let offsets = list_data.buffers()[0].typed_data::(); let mut offset_builder = BufferBuilder::::new(offsets.len()); offsets.iter().try_for_each::<_, Result<_>>(|offset| { @@ -2196,6 +2196,7 @@ where #[cfg(test)] mod tests { use super::*; + use crate::util::decimal::Decimal128; use crate::{buffer::Buffer, util::display::array_value_to_string}; macro_rules! 
generate_cast_test_case { @@ -2247,9 +2248,9 @@ mod tests { DecimalArray, &output_type, vec![ - Some(11234560_i128), - Some(21234560_i128), - Some(31234560_i128), + Some(Decimal128::new_from_i128(20, 4, 11234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 21234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 31234560_i128)), None ] ); @@ -2426,11 +2427,11 @@ mod tests { DecimalArray, &decimal_type, vec![ - Some(1000000_i128), - Some(2000000_i128), - Some(3000000_i128), + Some(Decimal128::new_from_i128(38, 6, 1000000_i128)), + Some(Decimal128::new_from_i128(38, 6, 2000000_i128)), + Some(Decimal128::new_from_i128(38, 6, 3000000_i128)), None, - Some(5000000_i128) + Some(Decimal128::new_from_i128(38, 6, 5000000_i128)) ] ); } @@ -2458,12 +2459,12 @@ mod tests { DecimalArray, &decimal_type, vec![ - Some(1100000_i128), - Some(2200000_i128), - Some(4400000_i128), + Some(Decimal128::new_from_i128(38, 6, 1100000_i128)), + Some(Decimal128::new_from_i128(38, 6, 2200000_i128)), + Some(Decimal128::new_from_i128(38, 6, 4400000_i128)), None, - Some(1123456_i128), - Some(1123456_i128), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), ] ); @@ -2483,13 +2484,13 @@ mod tests { DecimalArray, &decimal_type, vec![ - Some(1100000_i128), - Some(2200000_i128), - Some(4400000_i128), + Some(Decimal128::new_from_i128(38, 6, 1100000_i128)), + Some(Decimal128::new_from_i128(38, 6, 2200000_i128)), + Some(Decimal128::new_from_i128(38, 6, 4400000_i128)), None, - Some(1123456_i128), - Some(1123456_i128), - Some(1123456_i128), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), ] ); } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 590ed5b0f735..068b9dedf59b 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -52,7 
+52,7 @@ macro_rules! compare_op { } let null_bit_buffer = - combine_option_bitmap($left.data_ref(), $right.data_ref(), $left.len())?; + combine_option_bitmap(&[$left.data_ref(), $right.data_ref()], $left.len())?; // Safety: // `i < $left.len()` and $left.len() == $right.len() @@ -86,7 +86,7 @@ macro_rules! compare_op_primitive { } let null_bit_buffer = - combine_option_bitmap($left.data_ref(), $right.data_ref(), $left.len())?; + combine_option_bitmap(&[$left.data_ref(), $right.data_ref()], $left.len())?; let mut values = MutableBuffer::from_len_zeroed(($left.len() + 7) / 8); let lhs_chunks_iter = $left.values().chunks_exact(8); @@ -258,7 +258,7 @@ where } let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { @@ -548,6 +548,89 @@ pub fn ilike_utf8_scalar( Ok(BooleanArray::from(data)) } +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nilike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + let null_bit_buffer = left.data().null_buffer().cloned(); + let mut result = BooleanBufferBuilder::new(left.len()); + + if !right.contains(is_like_pattern) { + // fast path, can use equals + for i in 0..left.len() { + result.append(left.value(i) != right); + } + } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use ends_with + for i in 0..left.len() { + result.append( + !left + .value(i) + .to_uppercase() + .starts_with(&right[..right.len() - 1].to_uppercase()), + ); + } + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use starts_with + for i in 0..left.len() { + result.append( + !left + .value(i) + .to_uppercase() + .ends_with(&right[1..].to_uppercase()), + ); + } + } else { + let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; + for i in 0..left.len() { + let haystack = left.value(i); + result.append(!re.is_match(haystack)); + } + } + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + /// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. /// If `regex_array` element has an empty value, the corresponding result value is always true. 
/// @@ -567,7 +650,7 @@ pub fn regexp_is_match_utf8( )); } let null_bit_buffer = - combine_option_bitmap(array.data_ref(), regex_array.data_ref(), array.len())?; + combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len())?; let mut patterns: HashMap = HashMap::new(); let mut result = BooleanBufferBuilder::new(array.len()); @@ -1676,7 +1759,8 @@ where )); } - let null_bit_buffer = combine_option_bitmap(left.data_ref(), right.data_ref(), len)?; + let null_bit_buffer = + combine_option_bitmap(&[left.data_ref(), right.data_ref()], len)?; // we process the data in chunks so that each iteration results in one u64 of comparison result bits const CHUNK_SIZE: usize = 64; @@ -2617,7 +2701,7 @@ where let num_bytes = bit_util::ceil(left_len, 8); let not_both_null_bit_buffer = - match combine_option_bitmap(left.data_ref(), right.data_ref(), left_len)? { + match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len)? { Some(buff) => buff, None => new_all_set_buffer(num_bytes), }; @@ -2674,7 +2758,7 @@ where let num_bytes = bit_util::ceil(left_len, 8); let not_both_null_bit_buffer = - match combine_option_bitmap(left.data_ref(), right.data_ref(), left_len)? { + match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len)? 
{ Some(buff) => buff, None => new_all_set_buffer(num_bytes), }; @@ -3983,6 +4067,60 @@ mod tests { vec![false, true, false, false] ); + test_utf8!( + test_utf8_array_nilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + nilike_utf8, + vec![false, false, false, true, true, false, true] + ); + test_utf8_scalar!( + nilike_utf8_scalar_escape_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + nilike_utf8_scalar, + vec![false, false, true, true] + ); + test_utf8_scalar!( + test_utf8_array_nilike_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + nilike_utf8_scalar, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + nilike_utf8_scalar, + vec![false, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + nilike_utf8_scalar, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + nilike_utf8_scalar, + vec![false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + nilike_utf8_scalar, + vec![true, false, true, true] + ); + test_utf8!( test_utf8_array_neq, vec!["arrow", "arrow", "arrow", "arrow"], diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow/src/compute/kernels/concat_elements.rs index 47cbdfab17e0..bc341df889c0 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow/src/compute/kernels/concat_elements.rs @@ -45,7 +45,7 @@ pub fn concat_elements_utf8( ))); } - let output_bitmap = combine_option_bitmap(left.data(), right.data(), left.len())?; + let output_bitmap = combine_option_bitmap(&[left.data(), right.data()], 
left.len())?; let left_offsets = left.value_offsets(); let right_offsets = right.value_offsets(); diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index b59625115209..1af93bff5ad7 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -29,7 +29,7 @@ use crate::buffer::{buffer_bin_and, Buffer, MutableBuffer}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; -use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; +use crate::util::bit_iterator::{BitIndexIterator, BitSliceIterator}; use crate::util::bit_util; /// If the filter selects more than this fraction of rows, use @@ -72,47 +72,15 @@ macro_rules! downcast_dict_filter { /// /// 2. Only performant for filters that copy across long contiguous runs #[derive(Debug)] -pub struct SlicesIterator<'a> { - iter: UnalignedBitChunkIterator<'a>, - len: usize, - current_offset: i64, - current_chunk: u64, -} +pub struct SlicesIterator<'a>(BitSliceIterator<'a>); impl<'a> SlicesIterator<'a> { pub fn new(filter: &'a BooleanArray) -> Self { let values = &filter.data_ref().buffers()[0]; let len = filter.len(); - let chunk = UnalignedBitChunk::new(values.as_slice(), filter.offset(), len); - let mut iter = chunk.iter(); - - let current_offset = -(chunk.lead_padding() as i64); - let current_chunk = iter.next().unwrap_or(0); - - Self { - iter, - len, - current_offset, - current_chunk, - } - } - - /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at - /// least one bit set, or None if there is no such chunk. 
- /// - /// Where `chunk_offset` is the bit offset to the current `u64` chunk - /// and `bit_offset` is the offset of the first `1` bit in that chunk - fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> { - loop { - if self.current_chunk != 0 { - // Find the index of the first 1 - let bit_pos = self.current_chunk.trailing_zeros(); - return Some((self.current_offset, bit_pos)); - } + let offset = filter.offset(); - self.current_chunk = self.iter.next()?; - self.current_offset += 64; - } + Self(BitSliceIterator::new(values, offset, len)) } } @@ -120,43 +88,7 @@ impl<'a> Iterator for SlicesIterator<'a> { type Item = (usize, usize); fn next(&mut self) -> Option { - // Used as termination condition - if self.len == 0 { - return None; - } - - let (start_chunk, start_bit) = self.advance_to_set_bit()?; - - // Set bits up to start - self.current_chunk |= (1 << start_bit) - 1; - - loop { - if self.current_chunk != u64::MAX { - // Find the index of the first 0 - let end_bit = self.current_chunk.trailing_ones(); - - // Zero out up to end_bit - self.current_chunk &= !((1 << end_bit) - 1); - - return Some(( - (start_chunk + start_bit as i64) as usize, - (self.current_offset + end_bit as i64) as usize, - )); - } - - match self.iter.next() { - Some(next) => { - self.current_chunk = next; - self.current_offset += 64; - } - None => { - return Some(( - (start_chunk + start_bit as i64) as usize, - std::mem::replace(&mut self.len, 0), - )); - } - } - } + self.0.next() } } @@ -165,29 +97,16 @@ impl<'a> Iterator for SlicesIterator<'a> { /// This provides the best performance on most predicates, apart from those which keep /// large runs and therefore favour [`SlicesIterator`] struct IndexIterator<'a> { - current_chunk: u64, - chunk_offset: i64, remaining: usize, - iter: UnalignedBitChunkIterator<'a>, + iter: BitIndexIterator<'a>, } impl<'a> IndexIterator<'a> { - fn new(filter: &'a BooleanArray, len: usize) -> Self { + fn new(filter: &'a BooleanArray, remaining: usize) -> Self { 
assert_eq!(filter.null_count(), 0); let data = filter.data(); - let chunks = - UnalignedBitChunk::new(&data.buffers()[0], data.offset(), data.len()); - let mut iter = chunks.iter(); - - let current_chunk = iter.next().unwrap_or(0); - let chunk_offset = -(chunks.lead_padding() as i64); - - Self { - current_chunk, - chunk_offset, - remaining: len, - iter, - } + let iter = BitIndexIterator::new(&data.buffers()[0], data.offset(), data.len()); + Self { remaining, iter } } } @@ -195,17 +114,13 @@ impl<'a> Iterator for IndexIterator<'a> { type Item = usize; fn next(&mut self) -> Option { - while self.remaining != 0 { - if self.current_chunk != 0 { - let bit_pos = self.current_chunk.trailing_zeros(); - self.current_chunk ^= 1 << bit_pos; - self.remaining -= 1; - return Some((self.chunk_offset + bit_pos as i64) as usize); - } - + if self.remaining != 0 { + // Fascinatingly swapping these two lines around results in a 50% + // performance regression for some benchmarks + let next = self.iter.next().expect("IndexIterator exhausted early"); + self.remaining -= 1; // Must panic if exhausted early as trusted length iterator - self.current_chunk = self.iter.next().expect("IndexIterator exhausted early"); - self.chunk_offset += 64; + return Some(next); } None } @@ -1332,6 +1247,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] fn fuzz_test_slices_iterator() { let mut rng = thread_rng(); @@ -1401,6 +1317,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] fn fuzz_filter() { let mut rng = thread_rng(); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 140a57f33ed5..e399cf9f0c19 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -452,8 +452,7 @@ fn sort_boolean( let mut result = MutableBuffer::new(result_capacity); // sets len to capacity so we can access the whole buffer as a typed slice result.resize(result_capacity, 0); - // Safety: the buffer is always treated as `u32` in the code below - let 
result_slice: &mut [u32] = unsafe { result.typed_data_mut() }; + let result_slice: &mut [u32] = result.typed_data_mut(); if options.nulls_first { let size = nulls_len.min(len); @@ -504,7 +503,7 @@ where .expect("Unable to downcast to decimal array"); let valids = value_indices .into_iter() - .map(|index| (index, decimal_array.value(index as usize))) + .map(|index| (index, decimal_array.value(index as usize).as_i128())) .collect::>(); sort_primitive_inner(decimal_values, null_indices, cmp, options, limit, valids) } @@ -565,8 +564,7 @@ where let mut result = MutableBuffer::new(result_capacity); // sets len to capacity so we can access the whole buffer as a typed slice result.resize(result_capacity, 0); - // Safety: the buffer is always treated as `u32` in the code below - let result_slice: &mut [u32] = unsafe { result.typed_data_mut() }; + let result_slice: &mut [u32] = result.typed_data_mut(); if options.nulls_first { let size = nulls_len.min(len); diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index f1b6e8d4aa79..024f5633fef4 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -16,7 +16,8 @@ // under the License. //! Defines kernel to extract a substring of an Array -//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray +//! Supported array types: +//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray] use crate::array::DictionaryArray; use crate::buffer::MutableBuffer; @@ -29,7 +30,7 @@ use crate::{ use std::cmp::Ordering; use std::sync::Arc; -/// Returns an ArrayRef with substrings of all the elements in `array`. +/// Returns an [`ArrayRef`] with substrings of all the elements in `array`. /// /// # Arguments /// @@ -38,7 +39,7 @@ use std::sync::Arc; /// otherwise count from the end of the string. /// /// * `length`(option) - The length of all substrings. 
-/// If `length` is `None`, then the substring is from `start` to the end of the string. +/// If `length` is [None], then the substring is from `start` to the end of the string. /// /// Attention: Both `start` and `length` are counted by byte, not by char. /// @@ -53,9 +54,10 @@ use std::sync::Arc; /// ``` /// /// # Error -/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary -/// array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type. +/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`] +/// or [`DictionaryArray`] with supported array type as its value type. /// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array). +/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars. /// /// ## Example of trying to get an invalid utf-8 format substring /// ``` @@ -150,6 +152,101 @@ pub fn substring(array: &dyn Array, start: i64, length: Option) -> Result= 0`, then count from the start of the string, +/// otherwise count from the end of the string. +/// +/// * `length`(option) - The length of all substrings. +/// If `length` is `None`, then the substring is from `start` to the end of the string. +/// +/// Attention: Both `start` and `length` are counted by char. +/// +/// # Performance +/// This function is slower than [substring]. +/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer. +/// It is recommended to use [substring] if the input array only contains ASCII chars. 
+/// +/// # Basic usage +/// ``` +/// # use arrow::array::StringArray; +/// # use arrow::compute::kernels::substring::substring_by_char; +/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]); +/// let result = substring_by_char(&array, 1, Some(4)).unwrap(); +/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")])); +/// ``` +pub fn substring_by_char( + array: &GenericStringArray, + start: i64, + length: Option, +) -> Result> { + let mut vals = BufferBuilder::::new({ + let offsets = array.value_offsets(); + (offsets[array.len()] - offsets[0]).to_usize().unwrap() + }); + let mut new_offsets = BufferBuilder::::new(array.len() + 1); + new_offsets.append(OffsetSize::zero()); + let length = length.map(|len| len.to_usize().unwrap()); + + array.iter().for_each(|val| { + if let Some(val) = val { + let char_count = val.chars().count(); + let start = if start >= 0 { + start.to_usize().unwrap() + } else { + char_count - (-start).to_usize().unwrap().min(char_count) + }; + let (start_offset, end_offset) = get_start_end_offset(val, start, length); + vals.append_slice(&val.as_bytes()[start_offset..end_offset]); + } + new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap()); + }); + let data = unsafe { + ArrayData::new_unchecked( + GenericStringArray::::get_data_type(), + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![new_offsets.finish(), vals.finish()], + vec![], + ) + }; + Ok(GenericStringArray::::from(data)) +} + +/// * `val` - string +/// * `start` - the start char index of the substring +/// * `length` - the char length of the substring +/// +/// Return the `start` and `end` offset (by byte) of the substring +fn get_start_end_offset( + val: &str, + start: usize, + length: Option, +) -> (usize, usize) { + let len = val.len(); + let mut offset_char_iter = val.char_indices(); + let start_offset = offset_char_iter + .nth(start) + .map_or(len, 
|(offset, _)| offset); + let end_offset = length.map_or(len, |length| { + if length > 0 { + offset_char_iter + .nth(length - 1) + .map_or(len, |(offset, _)| offset) + } else { + start_offset + } + }); + (start_offset, end_offset) +} + fn binary_substring( array: &GenericBinaryArray, start: OffsetSize, @@ -348,218 +445,138 @@ mod tests { use super::*; use crate::datatypes::*; - #[allow(clippy::type_complexity)] - fn with_nulls_generic_binary() -> Result<()> { - let cases: Vec<(Vec>, i64, Option, Vec>)> = vec![ - // all-nulls array is always identical - (vec![None, None, None], -1, Some(1), vec![None, None, None]), + /// A helper macro to generate test cases. + /// # Arguments + /// * `input` - A vector which array can be built from. + /// * `start` - The start index of the substring. + /// * `len` - The length of the substring. + /// * `result` - The expected result of substring, which is a vector that array can be built from. + /// # Return + /// A vector of `(input, start, len, result)`. + /// + /// Users can provide any number of `(start, len, result)` to generate test cases for one `input`. + macro_rules! gen_test_cases { + ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => { + [ + $( + ($input.clone(), $start, $len, $result), + )* + ] + }; + } + + /// A helper macro to test the substring functions. + /// # Arguments + /// * `cases` - The test cases which is a vector of `(input, start, len, result)`. + /// Please look at [`gen_test_cases`] to find how to generate it. + /// * `array_ty` - The array type. + /// * `substring_fn` - Either [`substring`] or [`substring_by_char`]. + macro_rules! 
do_test { + ($cases:expr, $array_ty:ty, $substring_fn:ident) => { + $cases + .into_iter() + .for_each(|(array, start, length, expected)| { + let array = <$array_ty>::from(array); + let result = $substring_fn(&array, start, length).unwrap(); + let result = result.as_any().downcast_ref::<$array_ty>().unwrap(); + let expected = <$array_ty>::from(expected); + assert_eq!(&expected, result); + }) + }; + } + + fn with_nulls_generic_binary() { + let input = vec![ + Some("hello".as_bytes()), + None, + Some(&[0xf8, 0xf9, 0xff, 0xfa]), + ]; + // all-nulls array is always identical + let base_case = gen_test_cases!( + vec![None, None, None], + (-1, Some(1), vec![None, None, None]) + ); + let cases = gen_test_cases!( + input, // identity - ( - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - 0, - None, - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - ), + (0, None, input.clone()), // 0 length -> Nothing - ( - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - 0, - Some(0), - vec![Some(&[]), None, Some(&[])], - ), + (0, Some(0), vec![Some(&[]), None, Some(&[])]), // high start -> Nothing - ( - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - 1000, - Some(0), - vec![Some(&[]), None, Some(&[])], - ), + (1000, Some(0), vec![Some(&[]), None, Some(&[])]), // high negative start -> identity - ( - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - -1000, - None, - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - ), + (-1000, None, input.clone()), // high length -> identity - ( - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - 0, - Some(1000), - vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = GenericBinaryArray::::from(array); - let result: ArrayRef = substring(&array, start, length)?; - assert_eq!(array.len(), result.len()); - - let result = result - .as_any() - 
.downcast_ref::>() - .unwrap(); - let expected = GenericBinaryArray::::from(expected); - assert_eq!(&expected, result); - Ok(()) - }, - )?; - - Ok(()) + (0, Some(1000), input.clone()) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + GenericBinaryArray, + substring + ); } #[test] - fn with_nulls_binary() -> Result<()> { + fn with_nulls_binary() { with_nulls_generic_binary::() } #[test] - fn with_nulls_large_binary() -> Result<()> { + fn with_nulls_large_binary() { with_nulls_generic_binary::() } - #[allow(clippy::type_complexity)] - fn without_nulls_generic_binary() -> Result<()> { - let cases: Vec<(Vec<&[u8]>, i64, Option, Vec<&[u8]>)> = vec![ - // empty array is always identical - (vec![b"", b"", b""], 2, Some(1), vec![b"", b"", b""]), + fn without_nulls_generic_binary() { + let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]]; + // empty array is always identical + let base_case = gen_test_cases!( + vec!["".as_bytes(), b"", b""], + (2, Some(1), vec!["".as_bytes(), b"", b""]) + ); + let cases = gen_test_cases!( + input, + // identity + (0, None, input.clone()), // increase start - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 0, - None, - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 1, - None, - vec![b"ello", b"", &[0xf9, 0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 2, - None, - vec![b"llo", b"", &[0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 3, - None, - vec![b"lo", b"", &[0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 10, - None, - vec![b"", b"", b""], - ), + (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]), + (2, None, vec![b"llo", b"", &[0xff, 0xfa]]), + (3, None, vec![b"lo", b"", &[0xfa]]), + (10, None, vec![b"", b"", b""]), // increase start negatively - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -1, - None, - vec![b"o", b"", &[0xfa]], - ), - ( - vec![b"hello", b"", 
&[0xf8, 0xf9, 0xff, 0xfa]], - -2, - None, - vec![b"lo", b"", &[0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -3, - None, - vec![b"llo", b"", &[0xf9, 0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -10, - None, - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - ), + (-1, None, vec![b"o", b"", &[0xfa]]), + (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]), + (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]), + (-10, None, input.clone()), // increase length - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 1, - Some(1), - vec![b"e", b"", &[0xf9]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 1, - Some(2), - vec![b"el", b"", &[0xf9, 0xff]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 1, - Some(3), - vec![b"ell", b"", &[0xf9, 0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - 1, - Some(4), - vec![b"ello", b"", &[0xf9, 0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -3, - Some(1), - vec![b"l", b"", &[0xf9]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -3, - Some(2), - vec![b"ll", b"", &[0xf9, 0xff]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -3, - Some(3), - vec![b"llo", b"", &[0xf9, 0xff, 0xfa]], - ), - ( - vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]], - -3, - Some(4), - vec![b"llo", b"", &[0xf9, 0xff, 0xfa]], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = GenericBinaryArray::::from(array); - let result = substring(&array, start, length)?; - assert_eq!(array.len(), result.len()); - let result = result - .as_any() - .downcast_ref::>() - .unwrap(); - let expected = GenericBinaryArray::::from(expected); - assert_eq!(&expected, result,); - Ok(()) - }, - )?; - - Ok(()) + (1, Some(1), vec![b"e", b"", &[0xf9]]), + (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]), + (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]), + (1, Some(4), 
vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]), + (-3, Some(1), vec![b"l", b"", &[0xf9]]), + (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]), + (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]), + (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + GenericBinaryArray, + substring + ); } #[test] - fn without_nulls_binary() -> Result<()> { + fn without_nulls_binary() { without_nulls_generic_binary::() } #[test] - fn without_nulls_large_binary() -> Result<()> { + fn without_nulls_large_binary() { without_nulls_generic_binary::() } - fn generic_binary_with_non_zero_offset() -> Result<()> { + fn generic_binary_with_non_zero_offset() { let values = 0_u8..15; let offsets = &[ O::zero(), @@ -576,11 +593,12 @@ mod tests { .add_buffer(Buffer::from_iter(values)) .null_bit_buffer(Some(Buffer::from(bitmap))) .offset(1) - .build()?; + .build() + .unwrap(); // array is `[null, [10, 11, 12, 13, 14]]` let array = GenericBinaryArray::::from(data); // result is `[null, [11, 12, 13, 14]]` - let result = substring(&array, 1, None)?; + let result = substring(&array, 1, None).unwrap(); let result = result .as_any() .downcast_ref::>() @@ -588,277 +606,96 @@ mod tests { let expected = GenericBinaryArray::::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]); assert_eq!(result, &expected); - - Ok(()) } #[test] - fn binary_with_non_zero_offset() -> Result<()> { + fn binary_with_non_zero_offset() { generic_binary_with_non_zero_offset::() } #[test] - fn large_binary_with_non_zero_offset() -> Result<()> { + fn large_binary_with_non_zero_offset() { generic_binary_with_non_zero_offset::() } #[test] - #[allow(clippy::type_complexity)] - fn with_nulls_fixed_size_binary() -> Result<()> { - let cases: Vec<(Vec>, i64, Option, Vec>)> = vec![ - // all-nulls array is always identical - (vec![None, None, None], 3, Some(2), vec![None, None, None]), + fn with_nulls_fixed_size_binary() { + let input = vec![Some("cat".as_bytes()), None, 
Some(&[0xf8, 0xf9, 0xff])]; + // all-nulls array is always identical + let base_case = + gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None])); + let cases = gen_test_cases!( + input, + // identity + (0, None, input.clone()), // increase start - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 0, - None, - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 1, - None, - vec![Some(b"at"), None, Some(&[0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 2, - None, - vec![Some(b"t"), None, Some(&[0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 3, - None, - vec![Some(b""), None, Some(&[])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 10, - None, - vec![Some(b""), None, Some(b"")], - ), + (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]), + (2, None, vec![Some(b"t"), None, Some(&[0xff])]), + (3, None, vec![Some(b""), None, Some(b"")]), + (10, None, vec![Some(b""), None, Some(b"")]), // increase start negatively - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -1, - None, - vec![Some(b"t"), None, Some(&[0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -2, - None, - vec![Some(b"at"), None, Some(&[0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -3, - None, - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -10, - None, - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - ), + (-1, None, vec![Some(b"t"), None, Some(&[0xff])]), + (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]), + (-3, None, input.clone()), + (-10, None, input.clone()), // increase length - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 1, - Some(1), - vec![Some(b"a"), None, Some(&[0xf9])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 1, - 
Some(2), - vec![Some(b"at"), None, Some(&[0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - 1, - Some(3), - vec![Some(b"at"), None, Some(&[0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -3, - Some(1), - vec![Some(b"c"), None, Some(&[0xf8])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -3, - Some(2), - vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -3, - Some(3), - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - ), - ( - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - -3, - Some(4), - vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = FixedSizeBinaryArray::try_from_sparse_iter(array.into_iter()) - .unwrap(); - let result = substring(&array, start, length)?; - assert_eq!(array.len(), result.len()); - let result = result - .as_any() - .downcast_ref::() - .unwrap(); - let expected = - FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) - .unwrap(); - assert_eq!(&expected, result,); - Ok(()) - }, - )?; - - Ok(()) + (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]), + (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]), + (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]), + (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]), + (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]), + (-3, Some(3), input.clone()), + (-3, Some(4), input.clone()) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + FixedSizeBinaryArray, + substring + ); } #[test] - #[allow(clippy::type_complexity)] - fn without_nulls_fixed_size_binary() -> Result<()> { - let cases: Vec<(Vec<&[u8]>, i64, Option, Vec<&[u8]>)> = vec![ - // empty array is always identical - (vec![b"", b"", &[]], 3, Some(2), vec![b"", b"", &[]]), + fn without_nulls_fixed_size_binary() { + let input = 
vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]]; + // empty array is always identical + let base_case = gen_test_cases!( + vec!["".as_bytes(), &[], &[]], + (1, Some(2), vec!["".as_bytes(), &[], &[]]) + ); + let cases = gen_test_cases!( + input, + // identity + (0, None, input.clone()), // increase start - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 0, - None, - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 1, - None, - vec![b"at", b"og", &[0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 2, - None, - vec![b"t", b"g", &[0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 3, - None, - vec![b"", b"", &[]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 10, - None, - vec![b"", b"", b""], - ), + (1, None, vec![b"at", b"og", &[0xf9, 0xff]]), + (2, None, vec![b"t", b"g", &[0xff]]), + (3, None, vec![&[], &[], &[]]), + (10, None, vec![&[], &[], &[]]), // increase start negatively - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -1, - None, - vec![b"t", b"g", &[0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -2, - None, - vec![b"at", b"og", &[0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -3, - None, - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -10, - None, - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - ), + (-1, None, vec![b"t", b"g", &[0xff]]), + (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]), + (-3, None, input.clone()), + (-10, None, input.clone()), // increase length - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 1, - Some(1), - vec![b"a", b"o", &[0xf9]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 1, - Some(2), - vec![b"at", b"og", &[0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - 1, - Some(3), - vec![b"at", b"og", &[0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -3, - Some(1), - vec![b"c", b"d", &[0xf8]], - ), - 
( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -3, - Some(2), - vec![b"ca", b"do", &[0xf8, 0xf9]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -3, - Some(3), - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - ), - ( - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - -3, - Some(4), - vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = - FixedSizeBinaryArray::try_from_iter(array.into_iter()).unwrap(); - let result = substring(&array, start, length)?; - assert_eq!(array.len(), result.len()); - let result = result - .as_any() - .downcast_ref::() - .unwrap(); - let expected = - FixedSizeBinaryArray::try_from_iter(expected.into_iter()).unwrap(); - assert_eq!(&expected, result,); - Ok(()) - }, - )?; - - Ok(()) + (1, Some(1), vec![b"a", b"o", &[0xf9]]), + (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]), + (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]), + (-3, Some(1), vec![b"c", b"d", &[0xf8]]), + (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]), + (-3, Some(3), input.clone()), + (-3, Some(4), input.clone()) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + FixedSizeBinaryArray, + substring + ); } #[test] - fn fixed_size_binary_with_non_zero_offset() -> Result<()> { + fn fixed_size_binary_with_non_zero_offset() { let values: [u8; 15] = *b"hellotherearrow"; // set the first and third element to be valid let bits_v = [0b101_u8]; @@ -873,7 +710,7 @@ mod tests { // array is `[null, "arrow"]` let array = FixedSizeBinaryArray::from(data); // result is `[null, "rrow"]` - let result = substring(&array, 1, None)?; + let result = substring(&array, 1, None).unwrap(); let result = result .as_any() .downcast_ref::() @@ -883,165 +720,90 @@ mod tests { ) .unwrap(); assert_eq!(result, &expected); - - Ok(()) } - fn with_nulls_generic_string() -> Result<()> { - let cases = vec![ - // all-nulls array is always identical - (vec![None, None, None], 0, None, vec![None, 
None, None]), + fn with_nulls_generic_string() { + let input = vec![Some("hello"), None, Some("word")]; + // all-nulls array is always identical + let base_case = + gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None])); + let cases = gen_test_cases!( + input, // identity - ( - vec![Some("hello"), None, Some("word")], - 0, - None, - vec![Some("hello"), None, Some("word")], - ), + (0, None, input.clone()), // 0 length -> Nothing - ( - vec![Some("hello"), None, Some("word")], - 0, - Some(0), - vec![Some(""), None, Some("")], - ), + (0, Some(0), vec![Some(""), None, Some("")]), // high start -> Nothing - ( - vec![Some("hello"), None, Some("word")], - 1000, - Some(0), - vec![Some(""), None, Some("")], - ), + (1000, Some(0), vec![Some(""), None, Some("")]), // high negative start -> identity - ( - vec![Some("hello"), None, Some("word")], - -1000, - None, - vec![Some("hello"), None, Some("word")], - ), + (-1000, None, input.clone()), // high length -> identity - ( - vec![Some("hello"), None, Some("word")], - 0, - Some(1000), - vec![Some("hello"), None, Some("word")], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = GenericStringArray::::from(array); - let result: ArrayRef = substring(&array, start, length)?; - assert_eq!(array.len(), result.len()); - - let result = result - .as_any() - .downcast_ref::>() - .unwrap(); - let expected = GenericStringArray::::from(expected); - assert_eq!(&expected, result); - Ok(()) - }, - )?; - - Ok(()) + (0, Some(1000), input.clone()) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + GenericStringArray, + substring + ); } #[test] - fn with_nulls_string() -> Result<()> { + fn with_nulls_string() { with_nulls_generic_string::() } #[test] - fn with_nulls_large_string() -> Result<()> { + fn with_nulls_large_string() { with_nulls_generic_string::() } - fn without_nulls_generic_string() -> Result<()> { - let cases = vec![ - // empty array is always 
identical - (vec!["", "", ""], 0, None, vec!["", "", ""]), - // increase start - ( - vec!["hello", "", "word"], - 0, - None, - vec!["hello", "", "word"], - ), - (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]), - (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]), - (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]), - (vec!["hello", "", "word"], 10, None, vec!["", "", ""]), + fn without_nulls_generic_string() { + let input = vec!["hello", "", "word"]; + // empty array is always identical + let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""])); + let cases = gen_test_cases!( + input, + // identity + (0, None, input.clone()), + (1, None, vec!["ello", "", "ord"]), + (2, None, vec!["llo", "", "rd"]), + (3, None, vec!["lo", "", "d"]), + (10, None, vec!["", "", ""]), // increase start negatively - (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]), - (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]), - (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]), - ( - vec!["hello", "", "word"], - -10, - None, - vec!["hello", "", "word"], - ), + (-1, None, vec!["o", "", "d"]), + (-2, None, vec!["lo", "", "rd"]), + (-3, None, vec!["llo", "", "ord"]), + (-10, None, input.clone()), // increase length - (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]), - (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]), - ( - vec!["hello", "", "word"], - 1, - Some(3), - vec!["ell", "", "ord"], - ), - ( - vec!["hello", "", "word"], - 1, - Some(4), - vec!["ello", "", "ord"], - ), - (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]), - (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]), - ( - vec!["hello", "", "word"], - -3, - Some(3), - vec!["llo", "", "ord"], - ), - ( - vec!["hello", "", "word"], - -3, - Some(4), - vec!["llo", "", "ord"], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = 
GenericStringArray::::from(array); - let result = substring(&array, start, length)?; - assert_eq!(array.len(), result.len()); - let result = result - .as_any() - .downcast_ref::>() - .unwrap(); - let expected = GenericStringArray::::from(expected); - assert_eq!(&expected, result,); - Ok(()) - }, - )?; - - Ok(()) + (1, Some(1), vec!["e", "", "o"]), + (1, Some(2), vec!["el", "", "or"]), + (1, Some(3), vec!["ell", "", "ord"]), + (1, Some(4), vec!["ello", "", "ord"]), + (-3, Some(1), vec!["l", "", "o"]), + (-3, Some(2), vec!["ll", "", "or"]), + (-3, Some(3), vec!["llo", "", "ord"]), + (-3, Some(4), vec!["llo", "", "ord"]) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + GenericStringArray, + substring + ); } #[test] - fn without_nulls_string() -> Result<()> { + fn without_nulls_string() { without_nulls_generic_string::() } #[test] - fn without_nulls_large_string() -> Result<()> { + fn without_nulls_large_string() { without_nulls_generic_string::() } - fn generic_string_with_non_zero_offset() -> Result<()> { + fn generic_string_with_non_zero_offset() { let values = "hellotherearrow"; let offsets = &[ O::zero(), @@ -1058,45 +820,164 @@ mod tests { .add_buffer(Buffer::from(values)) .null_bit_buffer(Some(Buffer::from(bitmap))) .offset(1) - .build()?; + .build() + .unwrap(); // array is `[null, "arrow"]` let array = GenericStringArray::::from(data); // result is `[null, "rrow"]` - let result = substring(&array, 1, None)?; + let result = substring(&array, 1, None).unwrap(); let result = result .as_any() .downcast_ref::>() .unwrap(); let expected = GenericStringArray::::from(vec![None, Some("rrow")]); assert_eq!(result, &expected); - - Ok(()) } #[test] - fn string_with_non_zero_offset() -> Result<()> { + fn string_with_non_zero_offset() { generic_string_with_non_zero_offset::() } #[test] - fn large_string_with_non_zero_offset() -> Result<()> { + fn large_string_with_non_zero_offset() { generic_string_with_non_zero_offset::() } + fn 
with_nulls_generic_string_by_char() { + let input = vec![Some("hello"), None, Some("Γ ⊢x:T")]; + // all-nulls array is always identical + let base_case = + gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None])); + let cases = gen_test_cases!( + input, + // identity + (0, None, input.clone()), + // 0 length -> Nothing + (0, Some(0), vec![Some(""), None, Some("")]), + // high start -> Nothing + (1000, Some(0), vec![Some(""), None, Some("")]), + // high negative start -> identity + (-1000, None, input.clone()), + // high length -> identity + (0, Some(1000), input.clone()) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + GenericStringArray, + substring_by_char + ); + } + #[test] - fn dictionary() -> Result<()> { - _dictionary::()?; - _dictionary::()?; - _dictionary::()?; - _dictionary::()?; - _dictionary::()?; - _dictionary::()?; - _dictionary::()?; - _dictionary::()?; - Ok(()) + fn with_nulls_string_by_char() { + with_nulls_generic_string_by_char::() + } + + #[test] + fn with_nulls_large_string_by_char() { + with_nulls_generic_string_by_char::() + } + + fn without_nulls_generic_string_by_char() { + let input = vec!["hello", "", "Γ ⊢x:T"]; + // empty array is always identical + let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""])); + let cases = gen_test_cases!( + input, + //identity + (0, None, input.clone()), + // increase start + (1, None, vec!["ello", "", " ⊢x:T"]), + (2, None, vec!["llo", "", "⊢x:T"]), + (3, None, vec!["lo", "", "x:T"]), + (10, None, vec!["", "", ""]), + // increase start negatively + (-1, None, vec!["o", "", "T"]), + (-2, None, vec!["lo", "", ":T"]), + (-4, None, vec!["ello", "", "⊢x:T"]), + (-10, None, input.clone()), + // increase length + (1, Some(1), vec!["e", "", " "]), + (1, Some(2), vec!["el", "", " ⊢"]), + (1, Some(3), vec!["ell", "", " ⊢x"]), + (1, Some(6), vec!["ello", "", " ⊢x:T"]), + (-4, Some(1), vec!["e", "", "⊢"]), + (-4, Some(2), vec!["el", "", "⊢x"]), + (-4, Some(3), 
vec!["ell", "", "⊢x:"]), + (-4, Some(4), vec!["ello", "", "⊢x:T"]) + ); + + do_test!( + [&base_case[..], &cases[..]].concat(), + GenericStringArray, + substring_by_char + ); + } + + #[test] + fn without_nulls_string_by_char() { + without_nulls_generic_string_by_char::() } - fn _dictionary() -> Result<()> { + #[test] + fn without_nulls_large_string_by_char() { + without_nulls_generic_string_by_char::() + } + + fn generic_string_by_char_with_non_zero_offset() { + let values = "S→T = Πx:S.T"; + let offsets = &[ + O::zero(), + O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()) + .unwrap(), + O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()) + .unwrap(), + O::from_usize(values.len()).unwrap(), + ]; + // set the first and third element to be valid + let bitmap = [0b101_u8]; + + let data = ArrayData::builder(GenericStringArray::::get_data_type()) + .len(2) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from(values)) + .null_bit_buffer(Some(Buffer::from(bitmap))) + .offset(1) + .build() + .unwrap(); + // array is `[null, "Πx:S.T"]` + let array = GenericStringArray::::from(data); + // result is `[null, "x:S.T"]` + let result = substring_by_char(&array, 1, None).unwrap(); + let expected = GenericStringArray::::from(vec![None, Some("x:S.T")]); + assert_eq!(result, expected); + } + + #[test] + fn string_with_non_zero_offset_by_char() { + generic_string_by_char_with_non_zero_offset::() + } + + #[test] + fn large_string_with_non_zero_offset_by_char() { + generic_string_by_char_with_non_zero_offset::() + } + + #[test] + fn dictionary() { + _dictionary::(); + _dictionary::(); + _dictionary::(); + _dictionary::(); + _dictionary::(); + _dictionary::(); + _dictionary::(); + _dictionary::(); + } + + fn _dictionary() { const TOTAL: i32 = 100; let v = ["aaa", "bbb", "ccc", "ddd", "eee"]; @@ -1116,7 +997,7 @@ mod tests { let expected: Vec> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect(); - let res = 
substring(&dict_array, 1, Some(2))?; + let res = substring(&dict_array, 1, Some(2)).unwrap(); let actual = res.as_any().downcast_ref::>().unwrap(); let actual: Vec> = actual .values() @@ -1129,8 +1010,6 @@ mod tests { for i in 0..TOTAL as usize { assert_eq!(expected[i], actual[i],); } - - Ok(()) } #[test] diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 567bf5c8ba27..624e9ddcdb58 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -524,7 +524,7 @@ where if decimal_values.is_null(index) { Ok(None) } else { - Ok(Some(decimal_values.value(index))) + Ok(Some(decimal_values.value(index).as_i128())) } }); let t: Result>> = t.transpose(); @@ -688,8 +688,7 @@ where let bytes_offset = (data_len + 1) * std::mem::size_of::(); let mut offsets_buffer = MutableBuffer::from_len_zeroed(bytes_offset); - // Safety: the buffer is always treated as as a type of `OffsetSize` in the code below - let offsets = unsafe { offsets_buffer.typed_data_mut() }; + let offsets = offsets_buffer.typed_data_mut(); let mut values = MutableBuffer::new(0); let mut length_so_far = OffsetSize::zero(); offsets[0] = length_so_far; diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index aa49462da864..9998649ead30 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -17,7 +17,7 @@ //! Defines temporal kernels for time and date related functions. -use chrono::{Datelike, Timelike}; +use chrono::{Datelike, NaiveDate, NaiveDateTime, Timelike}; use crate::array::*; use crate::datatypes::*; @@ -112,6 +112,34 @@ macro_rules! 
return_compute_error_with { }; } +trait ChronoDateQuarter { + /// Returns a value in range `1..=4` indicating the quarter this date falls into + fn quarter(&self) -> u32; + + /// Returns a value in range `0..=3` indicating the quarter (zero-based) this date falls into + fn quarter0(&self) -> u32; +} + +impl ChronoDateQuarter for NaiveDateTime { + fn quarter(&self) -> u32 { + self.quarter0() + 1 + } + + fn quarter0(&self) -> u32 { + self.month0() / 3 + } +} + +impl ChronoDateQuarter for NaiveDate { + fn quarter(&self) -> u32 { + self.quarter0() + 1 + } + + fn quarter0(&self) -> u32 { + self.month0() / 3 + } +} + #[cfg(not(feature = "chrono-tz"))] pub fn using_chrono_tz_and_utc_naive_date_time( _tz: &str, @@ -183,6 +211,34 @@ where Ok(b.finish()) } +/// Extracts the quarter of a given temporal array as an array of integers +pub fn quarter(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + let mut b = Int32Builder::new(array.len()); + match array.data_type() { + &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { + extract_component_from_array!(array, b, quarter, value_as_datetime) + } + &DataType::Timestamp(_, Some(ref tz)) => { + let mut scratch = Parsed::new(); + extract_component_from_array!( + array, + b, + quarter, + value_as_datetime_with_tz, + tz, + scratch + ) + } + dt => return_compute_error_with!("quarter does not support", dt), + } + + Ok(b.finish()) +} + /// Extracts the month of a given temporal array as an array of integers pub fn month(array: &PrimitiveArray) -> Result where @@ -211,6 +267,37 @@ where Ok(b.finish()) } +/// Extracts the day of week of a given temporal array as an array of +/// integers. +/// +/// Monday is encoded as `0`, Tuesday as `1`, etc. 
+pub fn weekday(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + let mut b = Int32Builder::new(array.len()); + match array.data_type() { + &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { + extract_component_from_array!(array, b, weekday, value_as_datetime) + } + &DataType::Timestamp(_, Some(ref tz)) => { + let mut scratch = Parsed::new(); + extract_component_from_array!( + array, + b, + weekday, + value_as_datetime_with_tz, + tz, + scratch + ) + } + dt => return_compute_error_with!("weekday does not support", dt), + } + + Ok(b.finish()) +} + /// Extracts the day of a given temporal array as an array of integers pub fn day(array: &PrimitiveArray) -> Result where @@ -389,6 +476,48 @@ mod tests { assert_eq!(2012, b.value(2)); } + #[test] + fn test_temporal_array_date64_quarter() { + //1514764800000 -> 2018-01-01 + //1566275025000 -> 2019-08-20 + let a: PrimitiveArray = + vec![Some(1514764800000), None, Some(1566275025000)].into(); + + let b = quarter(&a).unwrap(); + assert_eq!(1, b.value(0)); + assert!(!b.is_valid(1)); + assert_eq!(3, b.value(2)); + } + + #[test] + fn test_temporal_array_date32_quarter() { + let a: PrimitiveArray = vec![Some(1), None, Some(300)].into(); + + let b = quarter(&a).unwrap(); + assert_eq!(1, b.value(0)); + assert!(!b.is_valid(1)); + assert_eq!(4, b.value(2)); + } + + #[test] + fn test_temporal_array_timestamp_quarter_with_timezone() { + use std::sync::Arc; + + // 24 * 60 * 60 = 86400 + let a = Arc::new(TimestampSecondArray::from_vec( + vec![86400 * 90], + Some("+00:00".to_string()), + )); + let b = quarter(&a).unwrap(); + assert_eq!(2, b.value(0)); + let a = Arc::new(TimestampSecondArray::from_vec( + vec![86400 * 90], + Some("-10:00".to_string()), + )); + let b = quarter(&a).unwrap(); + assert_eq!(1, b.value(0)); + } + #[test] fn test_temporal_array_date64_month() { //1514764800000 -> 2018-01-01 @@ -416,7 +545,7 @@ mod tests { fn 
test_temporal_array_timestamp_month_with_timezone() { use std::sync::Arc; - // 24 * 60 * 60 = 8640 + // 24 * 60 * 60 = 86400 let a = Arc::new(TimestampSecondArray::from_vec( vec![86400 * 31], Some("+00:00".to_string()), @@ -435,7 +564,7 @@ mod tests { fn test_temporal_array_timestamp_day_with_timezone() { use std::sync::Arc; - // 24 * 60 * 60 = 8640 + // 24 * 60 * 60 = 86400 let a = Arc::new(TimestampSecondArray::from_vec( vec![86400], Some("+00:00".to_string()), @@ -450,6 +579,19 @@ mod tests { assert_eq!(1, b.value(0)); } + #[test] + fn test_temporal_array_date64_weekday() { + //1514764800000 -> 2018-01-01 (Monday) + //1550636625000 -> 2019-02-20 (Wednesday) + let a: PrimitiveArray = + vec![Some(1514764800000), None, Some(1550636625000)].into(); + + let b = weekday(&a).unwrap(); + assert_eq!(0, b.value(0)); + assert!(!b.is_valid(1)); + assert_eq!(2, b.value(2)); + } + #[test] fn test_temporal_array_date64_day() { //1514764800000 -> 2018-01-01 diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 4b5029d68a7c..c8e68fbeb353 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -24,38 +24,41 @@ use crate::error::{ArrowError, Result}; use num::{One, ToPrimitive, Zero}; use std::ops::Add; -/// Combines the null bitmaps of two arrays using a bitwise `and` operation. +/// Combines the null bitmaps of multiple arrays using a bitwise `and` operation. /// /// This function is useful when implementing operations on higher level arrays. 
#[allow(clippy::unnecessary_wraps)] pub(super) fn combine_option_bitmap( - left_data: &ArrayData, - right_data: &ArrayData, + arrays: &[&ArrayData], len_in_bits: usize, ) -> Result> { - let left_offset_in_bits = left_data.offset(); - let right_offset_in_bits = right_data.offset(); - - let left = left_data.null_buffer(); - let right = right_data.null_buffer(); - - match left { - None => match right { - None => Ok(None), - Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))), - }, - Some(l) => match right { - None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))), - - Some(r) => Ok(Some(buffer_bin_and( - l, - left_offset_in_bits, - r, - right_offset_in_bits, - len_in_bits, - ))), - }, - } + arrays + .iter() + .map(|array| (array.null_buffer().cloned(), array.offset())) + .reduce(|acc, buffer_and_offset| match (acc, buffer_and_offset) { + ((None, _), (None, _)) => (None, 0), + ((Some(buffer), offset), (None, _)) | ((None, _), (Some(buffer), offset)) => { + (Some(buffer), offset) + } + ((Some(buffer_left), offset_left), (Some(buffer_right), offset_right)) => ( + Some(buffer_bin_and( + &buffer_left, + offset_left, + &buffer_right, + offset_right, + len_in_bits, + )), + 0, + ), + }) + .map_or( + Err(ArrowError::ComputeError( + "Arrays must not be empty".to_string(), + )), + |(buffer, offset)| { + Ok(buffer.map(|buffer| buffer.bit_slice(offset, len_in_bits))) + }, + ) } /// Takes/filters a list array's inner data using the offsets of the list array. 
@@ -184,7 +187,7 @@ pub(super) mod tests { offset: usize, null_bit_buffer: Option, ) -> Arc { - let buffer = Buffer::from(&vec![11; len]); + let buffer = Buffer::from(&vec![11; len + offset]); Arc::new( ArrayData::try_new( @@ -206,25 +209,87 @@ pub(super) mod tests { make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); let inverse_bitmap = make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); + let some_other_bitmap = + make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b11010111]))); assert_eq!( - None, - combine_option_bitmap(&none_bitmap, &none_bitmap, 8).unwrap() + combine_option_bitmap(&[], 8).unwrap_err().to_string(), + "Compute error: Arrays must not be empty", ); assert_eq!( Some(Buffer::from([0b01001010])), - combine_option_bitmap(&some_bitmap, &none_bitmap, 8).unwrap() + combine_option_bitmap(&[&some_bitmap], 8).unwrap() + ); + assert_eq!( + None, + combine_option_bitmap(&[&none_bitmap, &none_bitmap], 8).unwrap() ); assert_eq!( Some(Buffer::from([0b01001010])), - combine_option_bitmap(&none_bitmap, &some_bitmap, 8,).unwrap() + combine_option_bitmap(&[&some_bitmap, &none_bitmap], 8).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b11010111])), + combine_option_bitmap(&[&none_bitmap, &some_other_bitmap], 8).unwrap() ); assert_eq!( Some(Buffer::from([0b01001010])), - combine_option_bitmap(&some_bitmap, &some_bitmap, 8,).unwrap() + combine_option_bitmap(&[&some_bitmap, &some_bitmap], 8,).unwrap() ); assert_eq!( Some(Buffer::from([0b0])), - combine_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap() + combine_option_bitmap(&[&some_bitmap, &inverse_bitmap], 8,).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b01000010])), + combine_option_bitmap(&[&some_bitmap, &some_other_bitmap, &none_bitmap], 8,) + .unwrap() + ); + assert_eq!( + Some(Buffer::from([0b00001001])), + combine_option_bitmap( + &[ + &some_bitmap.slice(3, 5), + &inverse_bitmap.slice(2, 5), + &some_other_bitmap.slice(1, 5) + ], + 5, + ) + 
.unwrap() + ); + } + + #[test] + fn test_combine_option_bitmap_with_offsets() { + let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); + let bitmap0 = + make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10101010]))); + let bitmap1 = + make_data_with_null_bit_buffer(8, 1, Some(Buffer::from([0b01010100, 0b1]))); + let bitmap2 = + make_data_with_null_bit_buffer(8, 2, Some(Buffer::from([0b10101000, 0b10]))); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap1], 8).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap2], 8).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap1, &none_bitmap], 8).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&none_bitmap, &bitmap2], 8).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap0, &bitmap1], 8).unwrap() + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap1, &bitmap2], 8).unwrap() ); } diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index d8841964b586..21e107ee4c8e 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -120,7 +120,7 @@ pub struct ReaderOptions { /// Return inferred schema and number of records used for inference. This function does not change /// reader cursor offset. 
pub fn infer_file_schema( - reader: &mut R, + reader: R, delimiter: u8, max_read_records: Option, has_header: bool, @@ -136,12 +136,13 @@ pub fn infer_file_schema( } fn infer_file_schema_with_csv_options( - reader: &mut R, - roptoins: ReaderOptions, + mut reader: R, + roptions: ReaderOptions, ) -> Result<(Schema, usize)> { let saved_offset = reader.seek(SeekFrom::Current(0))?; - let (schema, records_count) = infer_reader_schema_with_csv_options(reader, roptoins)?; + let (schema, records_count) = + infer_reader_schema_with_csv_options(&mut reader, roptions)?; // return the reader seek back to the start reader.seek(SeekFrom::Start(saved_offset))?; @@ -155,7 +156,7 @@ fn infer_file_schema_with_csv_options( /// /// Return infered schema and number of records used for inference. pub fn infer_reader_schema( - reader: &mut R, + reader: R, delimiter: u8, max_read_records: Option, has_header: bool, @@ -170,7 +171,7 @@ pub fn infer_reader_schema( } fn infer_reader_schema_with_csv_options( - reader: &mut R, + reader: R, roptions: ReaderOptions, ) -> Result<(Schema, usize)> { let mut csv_reader = Reader::build_csv_reader( @@ -1203,8 +1204,8 @@ mod tests { fn test_csv_reader_with_decimal() { let schema = Schema::new(vec![ Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Decimal(26, 6), false), - Field::new("lng", DataType::Decimal(26, 6), false), + Field::new("lat", DataType::Decimal(38, 6), false), + Field::new("lng", DataType::Decimal(38, 6), false), ]); let file = File::open("test/data/decimal_test.csv").unwrap(); diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index b7755fae311b..6735d9668560 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -798,6 +798,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo // starting at row 2 and up to row 6. 
None, None, + None, ); let rb = reader.next().unwrap().unwrap(); let c1 = rb.column(0).as_any().downcast_ref::().unwrap(); diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index a740e8ecc019..895e5cc67c38 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -671,7 +671,7 @@ impl DataType { /// Compares the datatype with another, ignoring nested field names /// and metadata. - pub(crate) fn equals_datatype(&self, other: &DataType) -> bool { + pub fn equals_datatype(&self, other: &DataType) -> bool { match (&self, other) { (DataType::List(a), DataType::List(b)) | (DataType::LargeList(a), DataType::LargeList(b)) => { diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 2a8c99f0f89e..d9a3f667d8e4 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -19,6 +19,10 @@ use super::DataType; use half::f16; use serde_json::{Number, Value}; +mod private { + pub trait Sealed {} +} + /// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.). pub trait JsonSerializable: 'static { fn into_json_value(self) -> Option; @@ -26,8 +30,26 @@ pub trait JsonSerializable: 'static { /// Trait expressing a Rust type that has the same in-memory representation /// as Arrow. This includes `i16`, `f32`, but excludes `bool` (which in arrow is represented in bits). +/// /// In little endian machines, types that implement [`ArrowNativeType`] can be memcopied to arrow buffers /// as is. +/// +/// # Transmute Safety +/// +/// A type T implementing this trait means that any arbitrary slice of bytes of length and +/// alignment `size_of::()` can be safely interpreted as a value of that type without +/// being unsound, i.e. potentially resulting in undefined behaviour. +/// +/// Note: in the case of floating point numbers this transmutation can result in a signalling +/// NaN, which, whilst sound, can be unwieldy. 
In general, whilst it is perfectly sound to +/// reinterpret bytes as different types using this trait, it is likely unwise. For more information +/// see [f32::from_bits] and [f64::from_bits]. +/// +/// Note: `bool` is restricted to `0` or `1`, and so `bool: !ArrowNativeType` +/// +/// # Sealed +/// +/// Due to the above restrictions, this trait is sealed to prevent accidental misuse pub trait ArrowNativeType: std::fmt::Debug + Send @@ -37,6 +59,7 @@ pub trait ArrowNativeType: + std::str::FromStr + Default + JsonSerializable + + private::Sealed { /// Convert native type from usize. #[inline] @@ -109,6 +132,7 @@ impl JsonSerializable for i8 { } } +impl private::Sealed for i8 {} impl ArrowNativeType for i8 { #[inline] fn from_usize(v: usize) -> Option { @@ -132,6 +156,7 @@ impl JsonSerializable for i16 { } } +impl private::Sealed for i16 {} impl ArrowNativeType for i16 { #[inline] fn from_usize(v: usize) -> Option { @@ -155,6 +180,7 @@ impl JsonSerializable for i32 { } } +impl private::Sealed for i32 {} impl ArrowNativeType for i32 { #[inline] fn from_usize(v: usize) -> Option { @@ -184,6 +210,7 @@ impl JsonSerializable for i64 { } } +impl private::Sealed for i64 {} impl ArrowNativeType for i64 { #[inline] fn from_usize(v: usize) -> Option { @@ -217,6 +244,7 @@ impl JsonSerializable for i128 { } } +impl private::Sealed for i128 {} impl ArrowNativeType for i128 { #[inline] fn from_usize(v: usize) -> Option { @@ -246,6 +274,7 @@ impl JsonSerializable for u8 { } } +impl private::Sealed for u8 {} impl ArrowNativeType for u8 { #[inline] fn from_usize(v: usize) -> Option { @@ -269,6 +298,7 @@ impl JsonSerializable for u16 { } } +impl private::Sealed for u16 {} impl ArrowNativeType for u16 { #[inline] fn from_usize(v: usize) -> Option { @@ -292,6 +322,7 @@ impl JsonSerializable for u32 { } } +impl private::Sealed for u32 {} impl ArrowNativeType for u32 { #[inline] fn from_usize(v: usize) -> Option { @@ -315,6 +346,7 @@ impl JsonSerializable for u64 { } } +impl 
private::Sealed for u64 {} impl ArrowNativeType for u64 { #[inline] fn from_usize(v: usize) -> Option { @@ -351,8 +383,11 @@ impl JsonSerializable for f64 { } impl ArrowNativeType for f16 {} +impl private::Sealed for f16 {} impl ArrowNativeType for f32 {} +impl private::Sealed for f32 {} impl ArrowNativeType for f64 {} +impl private::Sealed for f64 {} /// Allows conversion from supported Arrow types to a byte slice. pub trait ToByteSlice { diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 4ab929829bfd..84905af20a63 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -79,6 +79,8 @@ //! unsafe { //! Box::from_raw(out_array_ptr); //! Box::from_raw(out_schema_ptr); +//! Arc::from_raw(array_ptr); +//! Arc::from_raw(schema_ptr); //! } //! //! Ok(()) @@ -907,6 +909,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_round_trip() -> Result<()> { // create an array natively let original_array = [Some(12345_i128), Some(-12345_i128), None] diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index ab4caea36f8e..3a85f2ef6421 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -81,7 +81,6 @@ const EINVAL: i32 = 22; const ENOSYS: i32 = 78; /// ABI-compatible struct for `ArrayStream` from C Stream Interface -/// This interface is experimental /// See /// This was created by bindgen #[repr(C)] @@ -198,13 +197,6 @@ impl ExportedArrayStream { } pub fn get_schema(&mut self, out: *mut FFI_ArrowSchema) -> i32 { - unsafe { - match (*out).release { - None => (), - Some(release) => release(out), - }; - }; - let mut private_data = self.get_private_data(); let reader = &private_data.batch_reader; @@ -224,18 +216,17 @@ impl ExportedArrayStream { } pub fn get_next(&mut self, out: *mut FFI_ArrowArray) -> i32 { - unsafe { - match (*out).release { - None => (), - Some(release) => release(out), - }; - }; - let mut private_data = self.get_private_data(); let reader = &mut private_data.batch_reader; let ret_code = match 
reader.next() { - None => 0, + None => { + // Marks ArrowArray released to indicate reaching the end of stream. + unsafe { + (*out).release = None; + } + 0 + } Some(next_batch) => { if let Ok(batch) = next_batch { let struct_array = StructArray::from(batch); @@ -275,7 +266,7 @@ fn get_error_code(err: &ArrowError) -> i32 { /// Struct used to fetch `RecordBatch` from the C Stream Interface. /// Its main responsibility is to expose `RecordBatchReader` functionality /// that requires [FFI_ArrowArrayStream]. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ArrowArrayStreamReader { stream: Arc, schema: SchemaRef, @@ -508,6 +499,8 @@ mod tests { } assert_eq!(produced_batches, vec![batch.clone(), batch]); + + unsafe { Arc::from_raw(stream_ptr) }; Ok(()) } @@ -537,6 +530,8 @@ mod tests { } assert_eq!(produced_batches, vec![batch.clone(), batch]); + + unsafe { Arc::from_raw(stream_ptr) }; Ok(()) } diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 7a0f2cce05c2..2ce29024ba11 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -31,7 +31,7 @@ use crate::compute::cast; use crate::datatypes::{DataType, Field, IntervalUnit, Schema, SchemaRef, UnionMode}; use crate::error::{ArrowError, Result}; use crate::ipc; -use crate::record_batch::{RecordBatch, RecordBatchReader}; +use crate::record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader}; use crate::ipc::compression::compression::CompressionCodecType; use crate::ipc::compression::{ @@ -111,6 +111,7 @@ fn read_uncompressed_size(buffer: &[u8]) -> i64 { /// - check if the bit width of non-64-bit numbers is 64, and /// - read the buffer as 64-bit (signed integer or float), and /// - cast the 64-bit array to the appropriate data type +#[allow(clippy::too_many_arguments)] fn create_array( nodes: &[ipc::FieldNode], field: &Field, @@ -120,6 +121,7 @@ fn create_array( mut node_index: usize, mut buffer_index: usize, compression_codec: &CompressionCodecType, + metadata: &ipc::MetadataVersion, 
) -> Result<(ArrayRef, usize, usize)> { use DataType::*; let data_type = field.data_type(); @@ -167,6 +169,7 @@ fn create_array( node_index, buffer_index, compression_codec, + metadata, )?; node_index = triple.1; buffer_index = triple.2; @@ -190,6 +193,7 @@ fn create_array( node_index, buffer_index, compression_codec, + metadata, )?; node_index = triple.1; buffer_index = triple.2; @@ -217,6 +221,7 @@ fn create_array( node_index, buffer_index, compression_codec, + metadata, )?; node_index = triple.1; buffer_index = triple.2; @@ -265,6 +270,13 @@ fn create_array( let len = union_node.length() as usize; + // In V4, union types has validity bitmap + // In V5 and later, union types have no validity bitmap + if metadata < &ipc::MetadataVersion::V5 { + read_buffer(&buffers[buffer_index], data); + buffer_index += 1; + } + let type_ids: Buffer = read_buffer(&buffers[buffer_index], data, compression_codec)[..len] .into(); @@ -293,6 +305,7 @@ fn create_array( node_index, buffer_index, compression_codec, + metadata, )?; node_index = triple.1; @@ -649,6 +662,7 @@ pub fn read_record_batch( schema: SchemaRef, dictionaries_by_id: &HashMap, projection: Option<&[usize]>, + metadata: &ipc::MetadataVersion, ) -> Result { let buffers = batch.buffers().ok_or_else(|| { ArrowError::IoError("Unable to get buffers from IPC RecordBatch".to_string()) @@ -672,6 +686,11 @@ pub fn read_record_batch( let mut node_index = 0; let mut arrays = vec![]; + let options = RecordBatchOptions { + row_count: Some(batch.length() as usize), + ..Default::default() + }; + if let Some(projection) = projection { // project fields for (idx, field) in schema.fields().iter().enumerate() { @@ -686,6 +705,7 @@ pub fn read_record_batch( node_index, buffer_index, &compression_codec, + metadata, )?; node_index = triple.1; buffer_index = triple.2; @@ -707,7 +727,11 @@ pub fn read_record_batch( } } - RecordBatch::try_new(Arc::new(schema.project(projection)?), arrays) + RecordBatch::try_new_with_options( + 
Arc::new(schema.project(projection)?), + arrays, + &options, + ) } else { // keep track of index as lists require more than one node for field in schema.fields() { @@ -720,12 +744,13 @@ pub fn read_record_batch( node_index, buffer_index, &compression_codec, + metadata, )?; node_index = triple.1; buffer_index = triple.2; arrays.push(triple.0); } - RecordBatch::try_new(schema, arrays) + RecordBatch::try_new_with_options(schema, arrays, &options) } } @@ -736,6 +761,7 @@ pub fn read_dictionary( batch: ipc::DictionaryBatch, schema: &Schema, dictionaries_by_id: &mut HashMap, + metadata: &ipc::MetadataVersion, ) -> Result<()> { if batch.isDelta() { return Err(ArrowError::IoError( @@ -756,7 +782,7 @@ pub fn read_dictionary( DataType::Dictionary(_, ref value_type) => { // Make a fake schema for the dictionary batch. let schema = Schema { - fields: vec![Field::new("", value_type.as_ref().clone(), false)], + fields: vec![Field::new("", value_type.as_ref().clone(), true)], metadata: HashMap::new(), }; // Read a single column @@ -766,6 +792,7 @@ pub fn read_dictionary( Arc::new(schema), dictionaries_by_id, None, + metadata, )?; Some(record_batch.column(0).clone()) } @@ -896,7 +923,13 @@ impl FileReader { ))?; reader.read_exact(&mut buf)?; - read_dictionary(&buf, batch, &schema, &mut dictionaries_by_id)?; + read_dictionary( + &buf, + batch, + &schema, + &mut dictionaries_by_id, + &message.version(), + )?; } t => { return Err(ArrowError::IoError(format!( @@ -1004,6 +1037,7 @@ impl FileReader { self.schema(), &self.dictionaries_by_id, self.projection.as_ref().map(|x| x.0.as_ref()), + &message.version() ).map(Some) } @@ -1178,7 +1212,7 @@ impl StreamReader { let mut buf = vec![0; message.bodyLength() as usize]; self.reader.read_exact(&mut buf)?; - read_record_batch(&buf, batch, self.schema(), &self.dictionaries_by_id, self.projection.as_ref().map(|x| x.0.as_ref())).map(Some) + read_record_batch(&buf, batch, self.schema(), &self.dictionaries_by_id, self.projection.as_ref().map(|x| 
x.0.as_ref()), &message.version()).map(Some) } ipc::MessageHeader::DictionaryBatch => { let batch = message.header_as_dictionary_batch().ok_or_else(|| { @@ -1191,7 +1225,7 @@ impl StreamReader { self.reader.read_exact(&mut buf)?; read_dictionary( - &buf, batch, &self.schema, &mut self.dictionaries_by_id + &buf, batch, &self.schema, &mut self.dictionaries_by_id, &message.version() )?; // read the next message until we encounter a RecordBatch @@ -1233,6 +1267,7 @@ mod tests { use crate::{datatypes, util::integration_util::*}; #[test] + #[cfg(not(feature = "force_validate"))] fn read_generated_files_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; @@ -1353,6 +1388,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn read_generated_streams_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; @@ -2032,4 +2068,17 @@ mod tests { let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } + + #[test] + fn test_no_columns_batch() { + let schema = Arc::new(Schema::new(vec![])); + let options = RecordBatchOptions { + match_field_names: true, + row_count: Some(10), + }; + let input_batch = + RecordBatch::try_new_with_options(schema, vec![], &options).unwrap(); + let output_batch = roundtrip_ipc_stream(&input_batch); + assert_eq!(input_batch, output_batch); + } } diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs index 1c7eb1e2d99a..dabbd70abc43 100644 --- a/arrow/src/ipc/writer.rs +++ b/arrow/src/ipc/writer.rs @@ -401,6 +401,7 @@ impl IpcDataGenerator { array.len(), array.null_count(), &compression_codec, + write_options, ); } // pad the tail of body data @@ -474,6 +475,7 @@ impl IpcDataGenerator { array_data.len(), array_data.null_count(), &compression_codec, + write_options, ); // pad the tail of body data @@ -945,7 +947,18 @@ fn write_continuation( Ok(written) } +/// In V4, null types have no validity bitmap +/// In V5 and later, 
null and union types have no validity bitmap +fn has_validity_bitmap(data_type: &DataType, write_options: &IpcWriteOptions) -> bool { + if write_options.metadata_version < ipc::MetadataVersion::V5 { + !matches!(data_type, DataType::Null) + } else { + !matches!(data_type, DataType::Null | DataType::Union(_, _, _)) + } +} + /// Write array data to a vector of bytes +#[allow(clippy::too_many_arguments)] fn write_array_data( array_data: &ArrayData, buffers: &mut Vec, @@ -955,6 +968,7 @@ fn write_array_data( num_rows: usize, null_count: usize, compression_codec: &CompressionCodecType, + write_options: &IpcWriteOptions, ) -> i64 { let mut offset = offset; if !matches!(array_data.data_type(), DataType::Null) { @@ -964,12 +978,7 @@ fn write_array_data( // where null_count is always 0. nodes.push(ipc::FieldNode::new(num_rows as i64, num_rows as i64)); } - // NullArray does not have any buffers, thus the null buffer is not generated - // UnionArray does not have a validity buffer - if !matches!( - array_data.data_type(), - DataType::Null | DataType::Union(_, _, _) - ) { + if has_validity_bitmap(array_data.data_type(), write_options) { // write null buffer if exists let null_buffer = match array_data.null_buffer() { None => { @@ -1003,6 +1012,7 @@ fn write_array_data( data_ref.len(), data_ref.null_count(), &compression_codec, + write_options, ); }); } @@ -1383,6 +1393,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn read_and_rewrite_generated_files_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; @@ -1435,6 +1446,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn read_and_rewrite_generated_streams_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; @@ -1688,4 +1700,116 @@ mod tests { // Dictionary with id 2 should have been written to the dict tracker assert!(dict_tracker.written.contains_key(&2)); } + + #[test] + fn read_union_017() { + let testdata = 
crate::util::test_util::arrow_test_data(); + let version = "0.17.1"; + let data_file = File::open(format!( + "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", + testdata, + )) + .unwrap(); + + let reader = StreamReader::try_new(data_file, None).unwrap(); + + // read and rewrite the stream to a temp location + { + let file = File::create(format!( + "target/debug/testdata/{}-generated_union.stream", + version + )) + .unwrap(); + let mut writer = StreamWriter::try_new(file, &reader.schema()).unwrap(); + reader.for_each(|batch| { + writer.write(&batch.unwrap()).unwrap(); + }); + writer.finish().unwrap(); + } + + // Compare original file and rewrote file + let file = File::open(format!( + "target/debug/testdata/{}-generated_union.stream", + version + )) + .unwrap(); + let rewrite_reader = StreamReader::try_new(file, None).unwrap(); + + let data_file = File::open(format!( + "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", + testdata, + )) + .unwrap(); + let reader = StreamReader::try_new(data_file, None).unwrap(); + + reader.into_iter().zip(rewrite_reader.into_iter()).for_each( + |(batch1, batch2)| { + assert_eq!(batch1.unwrap(), batch2.unwrap()); + }, + ); + } + + fn write_union_file(options: IpcWriteOptions) { + let schema = Schema::new(vec![Field::new( + "union", + DataType::Union( + vec![ + Field::new("a", DataType::Int32, false), + Field::new("c", DataType::Float64, false), + ], + vec![0, 1], + UnionMode::Sparse, + ), + true, + )]); + let mut builder = UnionBuilder::new_sparse(5); + builder.append::("a", 1).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("c", 3.0).unwrap(); + builder.append_null::("c").unwrap(); + builder.append::("a", 4).unwrap(); + let union = builder.build().unwrap(); + + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(union) as ArrayRef], + ) + .unwrap(); + let file_name = "target/debug/testdata/union.arrow_file"; + { + let file = File::create(&file_name).unwrap(); 
+ let mut writer = + FileWriter::try_new_with_options(file, &schema, options).unwrap(); + + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + + { + let file = File::open(&file_name).unwrap(); + let reader = FileReader::try_new(file, None).unwrap(); + reader.for_each(|maybe_batch| { + maybe_batch + .unwrap() + .columns() + .iter() + .zip(batch.columns()) + .for_each(|(a, b)| { + assert_eq!(a.data_type(), b.data_type()); + assert_eq!(a.len(), b.len()); + assert_eq!(a.null_count(), b.null_count()); + }); + }); + } + } + + #[test] + fn test_write_union_file_v4_v5() { + write_union_file( + IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap(), + ); + write_union_file( + IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap(), + ); + } } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 0d3ea0f0a2c3..95c69ca0be6d 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -15,123 +15,220 @@ // specific language governing permissions and limitations // under the License. -//! A native Rust implementation of [Apache Arrow](https://arrow.apache.org), a cross-language +//! A complete, safe, native Rust implementation of [Apache Arrow](https://arrow.apache.org), a cross-language //! development platform for in-memory data. //! -//! ### DataType +//! # Columnar Format //! -//! Every [`Array`](array::Array) in this crate has an associated [`DataType`](datatypes::DataType), -//! that specifies how its data is layed in memory and represented. -//! Thus, a central enum of this crate is [`DataType`](datatypes::DataType), that contains the set of valid -//! DataTypes in the specification. For example, [`DataType::Utf8`](datatypes::DataType::Utf8). +//! The [`array`] module provides statically typed implementations of all the array +//! types as defined by the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html). //! -//! ## Array -//! -//! 
The central trait of this package is the dynamically-typed [`Array`](array::Array) that -//! represents a fixed-sized, immutable, Send + Sync Array of nullable elements. An example of such an array is [`UInt32Array`](array::UInt32Array). -//! One way to think about an arrow [`Array`](array::Array) is a `Arc<[Option; len]>` where T can be anything ranging from an integer to a string, or even -//! another [`Array`](array::Array). -//! -//! [`Arrays`](array::Array) have [`len()`](array::Array::len), [`data_type()`](array::Array::data_type), and the nullability of each of its elements, -//! can be obtained via [`is_null(index)`](array::Array::is_null). To downcast an [`Array`](array::Array) to a specific implementation, you can use +//! For example, an [`Int32Array`](array::Int32Array) represents a nullable array of `i32` //! //! ```rust -//! use arrow::array::{Array, UInt32Array}; -//! let array = UInt32Array::from(vec![Some(1), None, Some(3)]); +//! # use arrow::array::{Array, Int32Array}; +//! let array = Int32Array::from(vec![Some(1), None, Some(3)]); //! assert_eq!(array.len(), 3); //! assert_eq!(array.value(0), 1); //! assert_eq!(array.is_null(1), true); -//! ``` //! -//! To make the array dynamically typed, we wrap it in an [`Arc`](std::sync::Arc): -//! -//! ```rust -//! # use std::sync::Arc; -//! use arrow::datatypes::DataType; -//! use arrow::array::{UInt32Array, ArrayRef}; -//! # let array = UInt32Array::from(vec![Some(1), None, Some(3)]); -//! let array: ArrayRef = Arc::new(array); -//! assert_eq!(array.len(), 3); -//! // array.value() is not available in the dynamically-typed version -//! assert_eq!(array.is_null(1), true); -//! assert_eq!(array.data_type(), &DataType::UInt32); +//! let collected: Vec<_> = array.iter().collect(); +//! assert_eq!(collected, vec![Some(1), None, Some(3)]); +//! assert_eq!(array.values(), [1, 0, 3]) //! ``` //! -//! to downcast, use `as_any()`: +//! It is also possible to write generic code. 
For example, the following is generic over +//! all primitively typed arrays: //! //! ```rust -//! # use std::sync::Arc; -//! # use arrow::array::{UInt32Array, ArrayRef}; -//! # let array = UInt32Array::from(vec![Some(1), None, Some(3)]); -//! # let array: ArrayRef = Arc::new(array); -//! let array = array.as_any().downcast_ref::().unwrap(); -//! assert_eq!(array.value(0), 1); +//! # use std::iter::Sum; +//! # use arrow::array::{Float32Array, PrimitiveArray, TimestampNanosecondArray}; +//! # use arrow::datatypes::ArrowPrimitiveType; +//! # +//! fn sum(array: &PrimitiveArray) -> T::Native +//! where +//! T: ArrowPrimitiveType, +//! T::Native: Sum +//! { +//! array.iter().map(|v| v.unwrap_or_default()).sum() +//! } +//! +//! assert_eq!(sum(&Float32Array::from(vec![1.1, 2.9, 3.])), 7.); +//! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6); //! ``` //! -//! ## Memory and Buffers +//! For more examples, consult the [`array`] docs. //! -//! Data in [`Array`](array::Array) is stored in [`ArrayData`](array::ArrayData), that in turn -//! is a collection of other [`ArrayData`](array::ArrayData) and [`Buffers`](buffer::Buffer). -//! [`Buffers`](buffer::Buffer) is the central struct that array implementations use keep allocated memory and pointers. -//! The [`MutableBuffer`](buffer::MutableBuffer) is the mutable counter-part of[`Buffer`](buffer::Buffer). -//! These are the lowest abstractions of this crate, and are used throughout the crate to -//! efficiently allocate, write, read and deallocate memory. +//! # Type Erasure / Trait Objects //! -//! ## Field, Schema and RecordBatch +//! It is often the case that code wishes to handle any type of array, without necessarily knowing +//! its concrete type. This use-case is catered for by a combination of [`Array`] +//! and [`DataType`](datatypes::DataType), with the former providing a type-erased container for +//! the array, and the latter identifying the concrete type of array. //! -//! 
[`Field`](datatypes::Field) is a struct that contains an array's metadata (datatype and whether its values -//! can be null), and a name. [`Schema`](datatypes::Schema) is a vector of fields with optional metadata. -//! Together, they form the basis of a schematic representation of a group of [`Arrays`](array::Array). +//! ```rust +//! # use arrow::array::{Array, Float32Array}; +//! # use arrow::array::StringArray; +//! # use arrow::datatypes::DataType; +//! # +//! fn impl_string(array: &StringArray) {} +//! fn impl_f32(array: &Float32Array) {} +//! +//! fn impl_dyn(array: &dyn Array) { +//! match array.data_type() { +//! DataType::Utf8 => impl_string(array.as_any().downcast_ref().unwrap()), +//! DataType::Float32 => impl_f32(array.as_any().downcast_ref().unwrap()), +//! _ => unimplemented!() +//! } +//! } +//! ``` //! -//! In fact, [`RecordBatch`](record_batch::RecordBatch) is a struct with a [`Schema`](datatypes::Schema) and a vector of -//! [`Array`](array::Array)s, all with the same `len`. A record batch is the highest order struct that this crate currently offers -//! and is broadly used to represent a table where each column in an `Array`. +//! It is also common to want to write a function that returns one of a number of possible +//! array implementations. [`ArrayRef`] is a type-alias for [`Arc`](array::Array) +//! which is frequently used for this purpose //! -//! ## Compute +//! ```rust +//! # use std::str::FromStr; +//! # use std::sync::Arc; +//! # use arrow::array::{ArrayRef, Int32Array, PrimitiveArray}; +//! # use arrow::datatypes::{ArrowPrimitiveType, DataType, Int32Type, UInt32Type}; +//! # use arrow::compute::cast; +//! # +//! fn parse_to_primitive<'a, T, I>(iter: I) -> PrimitiveArray +//! where +//! T: ArrowPrimitiveType, +//! I: IntoIterator, +//! { +//! PrimitiveArray::from_iter(iter.into_iter().map(|val| T::Native::from_str(val).ok())) +//! } +//! +//! fn parse_strings<'a, I>(iter: I, to_data_type: DataType) -> ArrayRef +//! where +//! 
I: IntoIterator, +//! { +//! match to_data_type { +//! DataType::Int32 => Arc::new(parse_to_primitive::(iter)) as _, +//! DataType::UInt32 => Arc::new(parse_to_primitive::(iter)) as _, +//! _ => unimplemented!() +//! } +//! } +//! +//! let array = parse_strings(["1", "2", "3"], DataType::Int32); +//! let integers = array.as_any().downcast_ref::().unwrap(); +//! assert_eq!(integers.values(), [1, 2, 3]) +//! ``` //! -//! This crate offers many operations (called kernels) to operate on [`Array`](array::Array)s, that you can find at [`Kernel`](compute::kernels). -//! It has both vertical and horizontal operations, and some of them have an SIMD implementation. +//! # Compute Kernels //! -//! ## Status +//! The [`compute`](compute) module provides optimised implementations of many common operations, +//! for example the `parse_strings` operation above could also be implemented as follows: //! -//! This crate has most of the implementation of the arrow specification. Specifically, it supports the following types: +//! ``` +//! # use std::sync::Arc; +//! # use arrow::error::Result; +//! # use arrow::array::{ArrayRef, StringArray, UInt32Array}; +//! # use arrow::datatypes::DataType; +//! # +//! fn parse_strings<'a, I>(iter: I, to_data_type: &DataType) -> Result +//! where +//! I: IntoIterator, +//! { +//! let array = Arc::new(StringArray::from_iter(iter.into_iter().map(Some))) as _; +//! arrow::compute::cast(&array, to_data_type) +//! } +//! +//! let array = parse_strings(["1", "2", "3"], &DataType::UInt32).unwrap(); +//! let integers = array.as_any().downcast_ref::().unwrap(); +//! assert_eq!(integers.values(), [1, 2, 3]) +//! ``` //! -//! * All arrow primitive types, such as [`Int32Array`](array::UInt8Array), [`BooleanArray`](array::BooleanArray) and [`Float64Array`](array::Float64Array). -//! * All arrow variable length types, such as [`StringArray`](array::StringArray) and [`BinaryArray`](array::BinaryArray) -//! 
* All composite types such as [`StructArray`](array::StructArray) and [`ListArray`](array::ListArray) -//! * Dictionary types [`DictionaryArray`](array::DictionaryArray) - +//! This module also implements many common vertical operations: //! -//! This crate also implements many common vertical operations: -//! * all mathematical binary operators, such as [`subtract`](compute::kernels::arithmetic::subtract) -//! * all boolean binary operators such as [`equality`](compute::kernels::comparison::eq) +//! * All mathematical binary operators, such as [`subtract`](compute::kernels::arithmetic::subtract) +//! * All boolean binary operators such as [`equality`](compute::kernels::comparison::eq) //! * [`cast`](compute::kernels::cast::cast) //! * [`filter`](compute::kernels::filter::filter) //! * [`take`](compute::kernels::take::take) and [`limit`](compute::kernels::limit::limit) //! * [`sort`](compute::kernels::sort::sort) //! * some string operators such as [`substring`](compute::kernels::substring::substring) and [`length`](compute::kernels::length::length) //! -//! as well as some horizontal operations, such as +//! As well as some horizontal operations, such as: //! //! * [`min`](compute::kernels::aggregate::min) and [`max`](compute::kernels::aggregate::max) //! * [`sum`](compute::kernels::aggregate::sum) //! -//! Finally, this crate implements some readers and writers to different formats: +//! # Tabular Representation +//! +//! It is common to want to group one or more columns together into a tabular representation. This +//! is provided by [`RecordBatch`] which combines a [`Schema`](datatypes::Schema) +//! and a corresponding list of [`ArrayRef`]. //! -//! * JSON: [`Reader`](json::reader::Reader) +//! +//! ``` +//! # use std::sync::Arc; +//! # use arrow::array::{Float32Array, Int32Array}; +//! # use arrow::record_batch::RecordBatch; +//! # +//! let col_1 = Arc::new(Int32Array::from_iter([1, 2, 3])) as _; +//! 
let col_2 = Arc::new(Float32Array::from_iter([1., 6.3, 4.])) as _; +//! +//! let batch = RecordBatch::try_from_iter([("col1", col_1), ("col_2", col_2)]).unwrap(); +//! ``` +//! +//! # IO +//! +//! This crate provides readers and writers for various formats to/from [`RecordBatch`] +//! +//! * JSON: [`Reader`](json::reader::Reader) and [`Writer`](json::writer::Writer) //! * CSV: [`Reader`](csv::reader::Reader) and [`Writer`](csv::writer::Writer) //! * IPC: [`Reader`](ipc::reader::StreamReader) and [`Writer`](ipc::writer::FileWriter) //! -//! The parquet implementation is on a [separate crate](https://crates.io/crates/parquet) +//! Parquet is published as a [separate crate](https://crates.io/crates/parquet) +//! +//! # Memory and Buffers +//! +//! Advanced users may wish to interact with the underlying buffers of an [`Array`], for example, +//! for FFI or high-performance conversion from other formats. This interface is provided by +//! [`ArrayData`] which stores the [`Buffer`] comprising an [`Array`], and can be accessed +//! with [`Array::data`](array::Array::data) +//! +//! The APIs for constructing [`ArrayData`] come in safe, and unsafe variants, with the former +//! performing extensive, but potentially expensive validation to ensure the buffers are well-formed. +//! +//! An [`ArrayRef`] can be cheaply created from an [`ArrayData`] using [`make_array`], +//! or by using the appropriate [`From`] conversion on the concrete [`Array`] implementation. +//! +//! # Safety and Security +//! +//! Like many crates, this crate makes use of unsafe where prudent. However, it endeavours to be +//! sound. Specifically, **it should not be possible to trigger undefined behaviour using safe APIs.** +//! +//! If you think you have found an instance where this is possible, please file +//! a ticket in our [issue tracker] and it will be triaged and fixed. For more information on +//! arrow's use of unsafe, see [here](https://github.com/apache/arrow-rs/tree/master/arrow#safety). +//! 
+//! # Higher-level Processing +//! +//! This crate aims to provide reusable, low-level primitives for operating on columnar data. For +//! more sophisticated query processing workloads, consider checking out [DataFusion]. This +//! orchestrates the primitives exported by this crate into an embeddable query engine, with +//! SQL and DataFrame frontends, and heavily influences this crate's roadmap. +//! +//! [`array`]: mod@array +//! [`Array`]: array::Array +//! [`ArrayRef`]: array::ArrayRef +//! [`ArrayData`]: array::ArrayData +//! [`make_array`]: array::make_array +//! [`Buffer`]: buffer::Buffer +//! [`RecordBatch`]: record_batch::RecordBatch +//! [DataFusion]: https://github.com/apache/arrow-datafusion +//! [issue tracker]: https://github.com/apache/arrow-rs/issues +//! -#![cfg_attr(feature = "avx512", feature(stdsimd))] -#![cfg_attr(feature = "avx512", feature(repr_simd))] -#![cfg_attr(feature = "avx512", feature(avx512_target_feature))] #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] pub mod alloc; -mod arch; pub mod array; pub mod bitmap; pub mod buffer; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 62e6316b621c..3ae5b3b9987f 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -24,13 +24,16 @@ use std::sync::Arc; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; -use pyo3::types::PyList; +use pyo3::types::{PyList, PyTuple}; use crate::array::{Array, ArrayData, ArrayRef}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; use crate::ffi::FFI_ArrowSchema; +use crate::ffi_stream::{ + export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream, +}; use crate::record_batch::RecordBatch; import_exception!(pyarrow, ArrowException); @@ -198,6 +201,42 @@ impl PyArrowConvert for RecordBatch { } } +impl PyArrowConvert for ArrowArrayStreamReader { + fn from_pyarrow(value: &PyAny) -> PyResult { + // prepare a pointer to receive the stream 
struct + let stream = Box::new(FFI_ArrowArrayStream::empty()); + let stream_ptr = Box::into_raw(stream) as *mut FFI_ArrowArrayStream; + + // make the conversion through PyArrow's private API + // this changes the pointer's memory and is thus unsafe. + // In particular, `_export_to_c` can go out of bounds + let args = PyTuple::new(value.py(), &[stream_ptr as Py_uintptr_t]); + value.call_method1("_export_to_c", args)?; + + let stream_reader = + unsafe { ArrowArrayStreamReader::from_raw(stream_ptr).unwrap() }; + + unsafe { + Box::from_raw(stream_ptr); + } + + Ok(stream_reader) + } + + fn to_pyarrow(&self, py: Python) -> PyResult { + let stream = Box::new(FFI_ArrowArrayStream::empty()); + let stream_ptr = Box::into_raw(stream) as *mut FFI_ArrowArrayStream; + + unsafe { export_reader_into_raw(Box::new(self.clone()), stream_ptr) }; + + let module = py.import("pyarrow")?; + let class = module.getattr("RecordBatchReader")?; + let args = PyTuple::new(py, &[stream_ptr as Py_uintptr_t]); + let reader = class.call_method1("_import_from_c", args)?; + Ok(PyObject::from(reader)) + } +} + macro_rules! add_conversion { ($typ:ty) => { impl<'source> FromPyObject<'source> for $typ { @@ -219,3 +258,4 @@ add_conversion!(Field); add_conversion!(Schema); add_conversion!(ArrayData); add_conversion!(RecordBatch); +add_conversion!(ArrowArrayStreamReader); diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs index db5aca2a1b3f..f0127ed2267f 100644 --- a/arrow/src/util/bit_chunk_iterator.rs +++ b/arrow/src/util/bit_chunk_iterator.rs @@ -1,5 +1,3 @@ -use std::fmt::Debug; - // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -16,7 +14,11 @@ use std::fmt::Debug; // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. + +//! 
Types for iterating over bitmasks in 64-bit chunks + use crate::util::bit_util::ceil; +use std::fmt::Debug; /// Iterates over an arbitrarily aligned byte buffer /// @@ -611,6 +613,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] fn fuzz_unaligned_bit_chunk_iterator() { let mut rng = thread_rng(); diff --git a/arrow/src/util/bit_iterator.rs b/arrow/src/util/bit_iterator.rs new file mode 100644 index 000000000000..bba9dac60a4b --- /dev/null +++ b/arrow/src/util/bit_iterator.rs @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; + +/// Iterator of contiguous ranges of set bits within a provided packed bitmask +/// +/// Returns `(usize, usize)` each representing an interval where the corresponding +/// bits in the provides mask are set +/// +#[derive(Debug)] +pub struct BitSliceIterator<'a> { + iter: UnalignedBitChunkIterator<'a>, + len: usize, + current_offset: i64, + current_chunk: u64, +} + +impl<'a> BitSliceIterator<'a> { + /// Create a new [`BitSliceIterator`] from the provide `buffer`, + /// and `offset` and `len` in bits + pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self { + let chunk = UnalignedBitChunk::new(buffer, offset, len); + let mut iter = chunk.iter(); + + let current_offset = -(chunk.lead_padding() as i64); + let current_chunk = iter.next().unwrap_or(0); + + Self { + iter, + len, + current_offset, + current_chunk, + } + } + + /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at + /// least one bit set, or None if there is no such chunk. 
+ /// + /// Where `chunk_offset` is the bit offset to the current `u64` chunk + /// and `bit_offset` is the offset of the first `1` bit in that chunk + fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> { + loop { + if self.current_chunk != 0 { + // Find the index of the first 1 + let bit_pos = self.current_chunk.trailing_zeros(); + return Some((self.current_offset, bit_pos)); + } + + self.current_chunk = self.iter.next()?; + self.current_offset += 64; + } + } +} + +impl<'a> Iterator for BitSliceIterator<'a> { + type Item = (usize, usize); + + fn next(&mut self) -> Option { + // Used as termination condition + if self.len == 0 { + return None; + } + + let (start_chunk, start_bit) = self.advance_to_set_bit()?; + + // Set bits up to start + self.current_chunk |= (1 << start_bit) - 1; + + loop { + if self.current_chunk != u64::MAX { + // Find the index of the first 0 + let end_bit = self.current_chunk.trailing_ones(); + + // Zero out up to end_bit + self.current_chunk &= !((1 << end_bit) - 1); + + return Some(( + (start_chunk + start_bit as i64) as usize, + (self.current_offset + end_bit as i64) as usize, + )); + } + + match self.iter.next() { + Some(next) => { + self.current_chunk = next; + self.current_offset += 64; + } + None => { + return Some(( + (start_chunk + start_bit as i64) as usize, + std::mem::replace(&mut self.len, 0), + )); + } + } + } + } +} + +/// An iterator of `usize` whose index in a provided bitmask is true +/// +/// This provides the best performance on most masks, apart from those which contain +/// large runs and therefore favour [`BitSliceIterator`] +#[derive(Debug)] +pub struct BitIndexIterator<'a> { + current_chunk: u64, + chunk_offset: i64, + iter: UnalignedBitChunkIterator<'a>, +} + +impl<'a> BitIndexIterator<'a> { + /// Create a new [`BitIndexIterator`] from the provide `buffer`, + /// and `offset` and `len` in bits + pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self { + let chunks = UnalignedBitChunk::new(buffer, offset, 
len); + let mut iter = chunks.iter(); + + let current_chunk = iter.next().unwrap_or(0); + let chunk_offset = -(chunks.lead_padding() as i64); + + Self { + current_chunk, + chunk_offset, + iter, + } + } +} + +impl<'a> Iterator for BitIndexIterator<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + loop { + if self.current_chunk != 0 { + let bit_pos = self.current_chunk.trailing_zeros(); + self.current_chunk ^= 1 << bit_pos; + return Some((self.chunk_offset + bit_pos as i64) as usize); + } + + self.current_chunk = self.iter.next()?; + self.chunk_offset += 64; + } + } +} + +// Note: tests located in filter module diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs new file mode 100644 index 000000000000..b78af3acc6cd --- /dev/null +++ b/arrow/src/util/decimal.rs @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decimal related utils + +use std::cmp::Ordering; + +/// Represents a decimal value with precision and scale. +/// The decimal value is represented by a signed 128-bit integer. 
+#[derive(Debug)] +pub struct Decimal128 { + #[allow(dead_code)] + precision: usize, + scale: usize, + value: i128, +} + +impl PartialOrd for Decimal128 { + fn partial_cmp(&self, other: &Self) -> Option { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimal128 with different scale: {}, {}", + self.scale, other.scale + ); + self.value.partial_cmp(&other.value) + } +} + +impl Ord for Decimal128 { + fn cmp(&self, other: &Self) -> Ordering { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimal128 with different scale: {}, {}", + self.scale, other.scale + ); + self.value.cmp(&other.value) + } +} + +impl PartialEq for Decimal128 { + fn eq(&self, other: &Self) -> bool { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimal128 with different scale: {}, {}", + self.scale, other.scale + ); + self.value.eq(&other.value) + } +} + +impl Eq for Decimal128 {} + +impl Decimal128 { + pub fn new_from_bytes(precision: usize, scale: usize, bytes: &[u8]) -> Self { + let as_array = bytes.try_into(); + let value = match as_array { + Ok(v) if bytes.len() == 16 => i128::from_le_bytes(v), + _ => panic!("Input to Decimal128 is not 128bit integer."), + }; + + Decimal128 { + precision, + scale, + value, + } + } + + pub fn new_from_i128(precision: usize, scale: usize, value: i128) -> Self { + Decimal128 { + precision, + scale, + value, + } + } + + pub fn as_i128(&self) -> i128 { + self.value + } + + pub fn as_string(&self) -> String { + let value_str = self.value.to_string(); + + if self.scale == 0 { + value_str + } else { + let (sign, rest) = value_str.split_at(if self.value >= 0 { 0 } else { 1 }); + + if rest.len() > self.scale { + // Decimal separator is in the middle of the string + let (whole, decimal) = value_str.split_at(value_str.len() - self.scale); + format!("{}.{}", whole, decimal) + } else { + // String has to be padded + format!("{}0.{:0>width$}", sign, rest, width = self.scale) + } + } + } +} + +impl From for i128 { + fn 
from(decimal: Decimal128) -> Self { + decimal.as_i128() + } +} + +#[cfg(test)] +mod tests { + use crate::util::decimal::Decimal128; + + #[test] + fn decimal_128_to_string() { + let mut value = Decimal128::new_from_i128(5, 2, 100); + assert_eq!(value.as_string(), "1.00"); + + value = Decimal128::new_from_i128(5, 3, 100); + assert_eq!(value.as_string(), "0.100"); + } + + #[test] + fn decimal_128_from_bytes() { + let bytes = 100_i128.to_le_bytes(); + let value = Decimal128::new_from_bytes(5, 2, &bytes); + assert_eq!(value.as_string(), "1.00"); + } + + fn i128_func(value: impl Into) -> i128 { + value.into() + } + + #[test] + fn decimal_128_to_i128() { + let value = Decimal128::new_from_i128(5, 2, 100); + let integer = i128_func(value); + assert_eq!(integer, 100); + } +} diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 3b6de8a4b263..86253da8d777 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -18,6 +18,7 @@ #[cfg(feature = "test_utils")] pub mod bench_util; pub mod bit_chunk_iterator; +pub mod bit_iterator; pub(crate) mod bit_mask; pub mod bit_util; #[cfg(feature = "test_utils")] @@ -35,4 +36,5 @@ pub mod test_util; mod trusted_len; pub(crate) use trusted_len::trusted_len_unzip; +pub mod decimal; pub(crate) mod reader_parser; diff --git a/arrow/test/dependency/README.md b/arrow/test/dependency/README.md deleted file mode 100644 index b618b4636e7c..000000000000 --- a/arrow/test/dependency/README.md +++ /dev/null @@ -1,21 +0,0 @@ - - -This directory contains projects that use arrow as a dependency with -various combinations of feature flags. diff --git a/arrow/test/dependency/default-features/Cargo.toml b/arrow/test/dependency/default-features/Cargo.toml deleted file mode 100644 index c03aef9979e5..000000000000 --- a/arrow/test/dependency/default-features/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "defeault-features" -description = "Models a user application of arrow that uses default features of arrow" -version = "0.1.0" -edition = "2021" -rust-version = "1.57" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -arrow = { path = "../../../../arrow", version = "15.0.0" } - -[workspace] diff --git a/arrow/test/dependency/default-features/src/main.rs b/arrow/test/dependency/default-features/src/main.rs deleted file mode 100644 index e7a11a969c03..000000000000 --- a/arrow/test/dependency/default-features/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} diff --git a/arrow/test/dependency/no-default-features/Cargo.toml b/arrow/test/dependency/no-default-features/Cargo.toml deleted file mode 100644 index c637aa6e9a8b..000000000000 --- a/arrow/test/dependency/no-default-features/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "no-default-features" -description = "Models a user application of arrow that specifies no-default-features=true" -version = "0.1.0" -edition = "2021" -rust-version = "1.57" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -arrow = { path = "../../../../arrow", version = "15.0.0", default-features = false } - -[workspace] diff --git a/arrow/test/dependency/no-default-features/src/main.rs b/arrow/test/dependency/no-default-features/src/main.rs deleted file mode 100644 index e7a11a969c03..000000000000 --- a/arrow/test/dependency/no-default-features/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} diff --git a/arrow/test/dependency/simd/Cargo.toml b/arrow/test/dependency/simd/Cargo.toml deleted file mode 100644 index ecc6a93a8be9..000000000000 --- a/arrow/test/dependency/simd/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "defeault-features" -description = "Models a user application of arrow that uses the simd feature of arrow" -version = "0.1.0" -edition = "2021" -rust-version = "1.57" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -arrow = { path = "../../../../arrow", version = "15.0.0", features = ["simd"]} - -[workspace] diff --git a/arrow/test/dependency/simd/src/main.rs b/arrow/test/dependency/simd/src/main.rs deleted file mode 100644 index e7a11a969c03..000000000000 --- a/arrow/test/dependency/simd/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} diff --git a/dev/release/README.md b/dev/release/README.md index 96c730a63c6e..912b60dae6b3 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -61,7 +61,7 @@ CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh git commit -a -m 'Create changelog' # update versions -sed -i '' -e 's/14.0.0/15.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/16.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' ``` diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index c7996a78af86..466f6fa45267 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -20,3 +20,4 @@ conbench/.isort.cfg arrow-flight/src/arrow.flight.protocol.rs arrow-flight/src/sql/arrow.flight.protocol.sql.rs .github/* +parquet/src/bin/parquet-fromcsv-help.txt diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 43b4ee606421..316f10c2594b 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="14.0.0" -FUTURE_RELEASE="15.0.0" +SINCE_TAG="15.0.0" +FUTURE_RELEASE="16.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index 7215c31d2f9a..57b5211129ff 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests" -version = "15.0.0" +version = "16.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -34,7 +34,7 @@ logging = ["tracing-subscriber"] arrow = { path = "../arrow" } arrow-flight = { path = "../arrow-flight" } async-trait = "0.1.41" -clap = { version = "3", features = ["derive", "env"] } +clap = { version = "~3.1", features = ["derive", "env"] } futures = "0.3" hex = "0.4" prost = "0.10" diff --git a/integration-testing/src/flight_client_scenarios/integration_test.rs b/integration-testing/src/flight_client_scenarios/integration_test.rs index 4158a7352140..62fe2b85d262 100644 --- a/integration-testing/src/flight_client_scenarios/integration_test.rs +++ 
b/integration-testing/src/flight_client_scenarios/integration_test.rs @@ -270,6 +270,7 @@ async fn receive_batch_flight_data( .expect("Error parsing dictionary"), &schema, dictionaries_by_id, + &message.version(), ) .expect("Error reading dictionary"); diff --git a/integration-testing/src/flight_server_scenarios/integration_test.rs b/integration-testing/src/flight_server_scenarios/integration_test.rs index 52086aade748..7ad3d18eb5ba 100644 --- a/integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/integration-testing/src/flight_server_scenarios/integration_test.rs @@ -296,6 +296,7 @@ async fn record_batch_from_message( schema_ref, dictionaries_by_id, None, + &message.version(), ); arrow_batch_result.map_err(|e| { @@ -313,8 +314,13 @@ async fn dictionary_from_message( Status::internal("Could not parse message header as dictionary batch") })?; - let dictionary_batch_result = - reader::read_dictionary(data_body, ipc_batch, &schema_ref, dictionaries_by_id); + let dictionary_batch_result = reader::read_dictionary( + data_body, + ipc_batch, + &schema_ref, + dictionaries_by_id, + &message.version(), + ); dictionary_batch_result.map_err(|e| { Status::internal(format!("Could not convert to Dictionary: {:?}", e)) }) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 90537242a11f..c7796ece4c73 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -593,6 +593,10 @@ fn array_from_json( } DataType::Decimal(precision, scale) => { let mut b = DecimalBuilder::new(json_col.count, *precision, *scale); + // C++ interop tests involve incompatible decimal values + unsafe { + b.disable_value_validation(); + } for (is_valid, value) in json_col .validity .as_ref() diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index bb7a8cd10583..28347bcb7dda 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "15.0.0" +version = "16.0.0" license = 
"Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -30,47 +30,52 @@ edition = "2021" rust-version = "1.57" [dependencies] -parquet-format = "4.0.0" -bytes = "1.1" -byteorder = "1" -thrift = "0.13" -snap = { version = "1.0", optional = true } -brotli = { version = "3.3", optional = true } -flate2 = { version = "1.0", optional = true } -lz4 = { version = "1.23", optional = true } +parquet-format = { version = "4.0.0", default-features = false } +bytes = { version = "1.1", default-features = false, features = ["std"] } +byteorder = { version = "1", default-features = false } +thrift = { version = "0.13", default-features = false } +snap = { version = "1.0", default-features = false, optional = true } +brotli = { version = "3.3", default-features = false, features = ["std"], optional = true } +flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } +lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.11.1", optional = true, default-features = false } -chrono = { version = "0.4", default-features = false } -num = "0.4" -num-bigint = "0.4" -arrow = { path = "../arrow", version = "15.0.0", optional = true, default-features = false, features = ["ipc"] } -base64 = { version = "0.13", optional = true } -clap = { version = "3", optional = true, features = ["derive", "env"] } -serde_json = { version = "1.0", features = ["preserve_order"], optional = true } -rand = "0.8" -futures = { version = "0.3", optional = true } +chrono = { version = "0.4", default-features = false, features = ["alloc"] } +num = { version = "0.4", default-features = false } +num-bigint = { version = "0.4", default-features = false } +arrow = { path = "../arrow", version = "16.0.0", optional = true, default-features = false, features = ["ipc"] } +base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } +clap = { version = 
"~3.1", default-features = false, features = ["std", "derive", "env"], optional = true } +serde_json = { version = "1.0", default-features = false, optional = true } +rand = { version = "0.8", default-features = false } +futures = { version = "0.3", default-features = false, features = ["std" ], optional = true } tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "fs", "rt", "io-util"] } [dev-dependencies] -criterion = "0.3" -rand = "0.8" -snap = "1.0" -tempfile = "3.0" -brotli = "3.3" -flate2 = "1.0" -lz4 = "1.23" -serde_json = { version = "1.0", features = ["preserve_order"] } -arrow = { path = "../arrow", version = "15.0.0", default-features = false, features = ["test_utils", "prettyprint"] } +base64 = { version = "0.13", default-features = false, features = ["std"] } +criterion = { version = "0.3", default-features = false } +snap = { version = "1.0", default-features = false } +tempfile = { version = "3.0", default-features = false } +brotli = { version = "3.3", default-features = false, features = [ "std" ] } +flate2 = { version = "1.0", default-features = false, features = [ "rust_backend" ] } +lz4 = { version = "1.23", default-features = false } +zstd = { version = "0.11", default-features = false } +serde_json = { version = "1.0", default-features = false, features = ["preserve_order"] } +arrow = { path = "../arrow", version = "16.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint"] } [package.metadata.docs.rs] all-features = true [features] default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] -cli = ["serde_json", "base64", "clap"] +# Enable arrow reader/writer APIs +arrow = ["dep:arrow", "base64"] +# Enable CLI tools +cli = ["serde_json", "base64", "clap","arrow/csv"] +# Enable internal testing APIs test_common = [] # Experimental, unstable functionality primarily used for testing experimental = [] -# Enable async API +# Enable async APIs async = ["futures", "tokio"] 
[[bin]] @@ -85,13 +90,18 @@ required-features = ["cli"] name = "parquet-rowcount" required-features = ["cli"] +[[bin]] +name = "parquet-fromcsv" +required-features = ["cli"] + [[bench]] name = "arrow_writer" +required-features = ["arrow"] harness = false [[bench]] name = "arrow_reader" -required-features = ["test_common", "experimental"] +required-features = ["arrow", "test_common", "experimental"] harness = false [lib] diff --git a/parquet/README.md b/parquet/README.md index ed8b5518b8f9..fbb6e3e1b5d5 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -27,7 +27,7 @@ See [crate documentation](https://docs.rs/parquet/latest/parquet/) for examples ## Rust Version Compatbility -This crate is tested with the latest stable version of Rust. We do not currrently test against other, older versions of the Rust compiler. +This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. ## Features diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 04e48baef705..647a8dc6f393 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -355,27 +355,6 @@ fn create_string_byte_array_dictionary_reader( .unwrap() } -fn create_complex_object_byte_array_dictionary_reader( - page_iterator: impl PageIterator + 'static, - column_desc: ColumnDescPtr, -) -> Box { - use parquet::arrow::array_reader::ComplexObjectArrayReader; - use parquet::arrow::converter::{Utf8ArrayConverter, Utf8Converter}; - let arrow_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - - let converter = Utf8Converter::new(Utf8ArrayConverter {}); - Box::new( - ComplexObjectArrayReader::::new( - Box::new(page_iterator), - column_desc, - converter, - Some(arrow_type), - ) - .unwrap(), - ) -} - fn bench_primitive( group: &mut BenchmarkGroup, schema: &SchemaDescPtr, @@ -678,18 +657,7 @@ fn add_benches(c: &mut Criterion) { let mut group = 
c.benchmark_group("arrow_array_reader/StringDictionary"); - group.bench_function("dictionary encoded, mandatory, no NULLs - old", |b| { - b.iter(|| { - let array_reader = create_complex_object_byte_array_dictionary_reader( - dictionary_string_no_null_data.clone(), - mandatory_string_column_desc.clone(), - ); - count = bench_array_reader(array_reader); - }); - assert_eq!(count, EXPECTED_VALUE_COUNT); - }); - - group.bench_function("dictionary encoded, mandatory, no NULLs - new", |b| { + group.bench_function("dictionary encoded, mandatory, no NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_dictionary_reader( dictionary_string_no_null_data.clone(), @@ -700,18 +668,7 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - group.bench_function("dictionary encoded, optional, no NULLs - old", |b| { - b.iter(|| { - let array_reader = create_complex_object_byte_array_dictionary_reader( - dictionary_string_no_null_data.clone(), - optional_string_column_desc.clone(), - ); - count = bench_array_reader(array_reader); - }); - assert_eq!(count, EXPECTED_VALUE_COUNT); - }); - - group.bench_function("dictionary encoded, optional, no NULLs - new", |b| { + group.bench_function("dictionary encoded, optional, no NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_dictionary_reader( dictionary_string_no_null_data.clone(), @@ -722,18 +679,7 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - group.bench_function("dictionary encoded, optional, half NULLs - old", |b| { - b.iter(|| { - let array_reader = create_complex_object_byte_array_dictionary_reader( - dictionary_string_half_null_data.clone(), - optional_string_column_desc.clone(), - ); - count = bench_array_reader(array_reader); - }); - assert_eq!(count, EXPECTED_VALUE_COUNT); - }); - - group.bench_function("dictionary encoded, optional, half NULLs - new", |b| { + group.bench_function("dictionary encoded, optional, half NULLs", |b| 
{ b.iter(|| { let array_reader = create_string_byte_array_dictionary_reader( dictionary_string_half_null_data.clone(), diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 7b9adfc23f25..e8c22f95aa0a 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -25,7 +25,7 @@ use crate::arrow::array_reader::{ ComplexObjectArrayReader, ListArrayReader, MapArrayReader, NullArrayReader, PrimitiveArrayReader, RowGroupCollection, StructArrayReader, }; -use crate::arrow::converter::{ +use crate::arrow::buffer::converter::{ DecimalArrayConverter, DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter, Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter, IntervalDayTimeConverter, diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index b3606a7808b0..9e0f83fa9450 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::arrow::array_reader::offset_buffer::OffsetBuffer; use crate::arrow::array_reader::{read_records, ArrayReader}; +use crate::arrow::buffer::offset_buffer::OffsetBuffer; use crate::arrow::record_reader::buffer::ScalarValue; use crate::arrow::record_reader::GenericRecordReader; use crate::arrow::schema::parquet_to_arrow_field; @@ -125,13 +125,13 @@ impl ArrayReader for ByteArrayReader { fn get_def_levels(&self) -> Option<&[i16]> { self.def_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } fn get_rep_levels(&self) -> Option<&[i16]> { self.rep_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } } diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index fe8448ffb31e..0cd67206f000 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -24,12 +24,11 @@ use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; use arrow::buffer::Buffer; use arrow::datatypes::{ArrowNativeType, DataType as ArrowType}; -use crate::arrow::array_reader::dictionary_buffer::DictionaryBuffer; -use crate::arrow::array_reader::{ - byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain}, - offset_buffer::OffsetBuffer, -}; +use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain}; use crate::arrow::array_reader::{read_records, ArrayReader}; +use crate::arrow::buffer::{ + dictionary_buffer::DictionaryBuffer, offset_buffer::OffsetBuffer, +}; use crate::arrow::record_reader::buffer::{BufferQueue, ScalarValue}; use crate::arrow::record_reader::GenericRecordReader; use crate::arrow::schema::parquet_to_arrow_field; @@ -188,13 +187,13 @@ where fn get_def_levels(&self) -> Option<&[i16]> { self.def_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } fn get_rep_levels(&self) -> 
Option<&[i16]> { self.rep_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } } @@ -236,13 +235,13 @@ where fn new(col: &ColumnDescPtr) -> Self { let validate_utf8 = col.converted_type() == ConvertedType::UTF8; - let value_type = - match (V::IS_LARGE, col.converted_type() == ConvertedType::UTF8) { - (true, true) => ArrowType::LargeUtf8, - (true, false) => ArrowType::LargeBinary, - (false, true) => ArrowType::Utf8, - (false, false) => ArrowType::Binary, - }; + let value_type = match (V::IS_LARGE, col.converted_type() == ConvertedType::UTF8) + { + (true, true) => ArrowType::LargeUtf8, + (true, false) => ArrowType::LargeBinary, + (false, true) => ArrowType::Utf8, + (false, false) => ArrowType::Binary, + }; Self { dict: None, @@ -357,7 +356,7 @@ where assert_eq!(dict.data_type(), &self.value_type); let dict_buffers = dict.data().buffers(); - let dict_offsets = unsafe { dict_buffers[0].typed_data::() }; + let dict_offsets = dict_buffers[0].typed_data::(); let dict_values = dict_buffers[1].as_slice(); values.extend_from_dictionary( diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader/mod.rs similarity index 99% rename from parquet/src/arrow/array_reader.rs rename to parquet/src/arrow/array_reader/mod.rs index c70071dacf3f..6207b377d137 100644 --- a/parquet/src/arrow/array_reader.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! 
Logic for reading into arrow arrays + use std::any::Any; use std::cmp::max; use std::marker::PhantomData; @@ -34,7 +36,7 @@ use arrow::datatypes::{ UInt32Type as ArrowUInt32Type, UInt64Type as ArrowUInt64Type, }; -use crate::arrow::converter::Converter; +use crate::arrow::buffer::converter::Converter; use crate::arrow::record_reader::buffer::{ScalarValue, ValuesBuffer}; use crate::arrow::record_reader::{GenericRecordReader, RecordReader}; use crate::arrow::schema::parquet_to_arrow_field; @@ -50,11 +52,9 @@ use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; mod builder; mod byte_array; mod byte_array_dictionary; -mod dictionary_buffer; mod empty_array; mod list_array; mod map_array; -mod offset_buffer; #[cfg(test)] mod test_util; @@ -226,13 +226,13 @@ where fn get_def_levels(&self) -> Option<&[i16]> { self.def_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } fn get_rep_levels(&self) -> Option<&[i16]> { self.rep_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } } @@ -447,13 +447,13 @@ where fn get_def_levels(&self) -> Option<&[i16]> { self.def_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } fn get_rep_levels(&self) -> Option<&[i16]> { self.rep_levels_buffer .as_ref() - .map(|buf| unsafe { buf.typed_data() }) + .map(|buf| buf.typed_data()) } } @@ -811,7 +811,7 @@ mod tests { TimestampMillisecondType as ArrowTimestampMillisecondType, }; - use crate::arrow::converter::{Utf8ArrayConverter, Utf8Converter}; + use crate::arrow::buffer::converter::{Utf8ArrayConverter, Utf8Converter}; use crate::basic::{Encoding, Type as PhysicalType}; use crate::column::page::Page; use crate::data_type::{ByteArray, ByteArrayType, DataType, Int32Type, Int64Type}; @@ -1384,8 +1384,7 @@ mod tests { let mut all_values = Vec::with_capacity(num_pages * values_per_page); for i in 0..num_pages { - let mut dict_encoder = - 
DictEncoder::::new(column_desc.clone()); + let mut dict_encoder = DictEncoder::::new(column_desc.clone()); // add data page let mut values = Vec::with_capacity(values_per_page); diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader.rs index 34a14f3725f7..89406cd616a4 100644 --- a/parquet/src/arrow/arrow_reader.rs +++ b/parquet/src/arrow/arrow_reader.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Contains reader which reads parquet data into arrow array. +//! Contains reader which reads parquet data into arrow [`RecordBatch`] use std::sync::Arc; @@ -31,7 +31,8 @@ use crate::arrow::schema::parquet_to_arrow_schema_by_columns; use crate::arrow::ProjectionMask; use crate::errors::Result; use crate::file::metadata::{KeyValue, ParquetMetaData}; -use crate::file::reader::FileReader; +use crate::file::reader::{ChunkReader, FileReader, SerializedFileReader}; +use crate::schema::types::SchemaDescriptor; /// Arrow reader api. 
/// With this api, user can get arrow schema from parquet file, and read parquet data @@ -144,15 +145,40 @@ impl ArrowReader for ParquetFileArrowReader { } impl ParquetFileArrowReader { - /// Create a new [`ParquetFileArrowReader`] + /// Create a new [`ParquetFileArrowReader`] with the provided [`ChunkReader`] + /// + /// ```no_run + /// # use std::fs::File; + /// # use bytes::Bytes; + /// # use parquet::arrow::ParquetFileArrowReader; + /// + /// let file = File::open("file.parquet").unwrap(); + /// let reader = ParquetFileArrowReader::try_new(file).unwrap(); + /// + /// let bytes = Bytes::from(vec![]); + /// let reader = ParquetFileArrowReader::try_new(bytes).unwrap(); + /// ``` + pub fn try_new(chunk_reader: R) -> Result { + Self::try_new_with_options(chunk_reader, Default::default()) + } + + /// Create a new [`ParquetFileArrowReader`] with the provided [`ChunkReader`] + /// and [`ArrowReaderOptions`] + pub fn try_new_with_options( + chunk_reader: R, + options: ArrowReaderOptions, + ) -> Result { + let file_reader = Arc::new(SerializedFileReader::new(chunk_reader)?); + Ok(Self::new_with_options(file_reader, options)) + } + + /// Create a new [`ParquetFileArrowReader`] with the provided [`Arc`] pub fn new(file_reader: Arc) -> Self { - Self { - file_reader, - options: Default::default(), - } + Self::new_with_options(file_reader, Default::default()) } - /// Create a new [`ParquetFileArrowReader`] with the provided [`ArrowReaderOptions`] + /// Create a new [`ParquetFileArrowReader`] with the provided [`Arc`] + /// and [`ArrowReaderOptions`] pub fn new_with_options( file_reader: Arc, options: ArrowReaderOptions, @@ -164,10 +190,21 @@ impl ParquetFileArrowReader { } /// Expose the reader metadata + #[deprecated = "use metadata() instead"] pub fn get_metadata(&mut self) -> ParquetMetaData { self.file_reader.metadata().clone() } + /// Returns the parquet metadata + pub fn metadata(&self) -> &ParquetMetaData { + self.file_reader.metadata() + } + + /// Returns the parquet 
schema + pub fn parquet_schema(&self) -> &SchemaDescriptor { + self.file_reader.metadata().file_metadata().schema_descr() + } + /// Returns the key value metadata, returns `None` if [`ArrowReaderOptions::skip_arrow_metadata`] fn get_kv_metadata(&self) -> Option<&Vec> { if self.options.skip_arrow_metadata { @@ -236,6 +273,7 @@ impl ParquetRecordBatchReader { #[cfg(test)] mod tests { + use bytes::Bytes; use std::cmp::min; use std::convert::TryFrom; use std::fs::File; @@ -256,7 +294,7 @@ mod tests { use crate::arrow::arrow_reader::{ ArrowReader, ArrowReaderOptions, ParquetFileArrowReader, }; - use crate::arrow::converter::{ + use crate::arrow::buffer::converter::{ BinaryArrayConverter, Converter, FixedSizeArrayConverter, FromConverter, IntervalDayTimeArrayConverter, LargeUtf8ArrayConverter, Utf8ArrayConverter, }; @@ -273,7 +311,6 @@ mod tests { use crate::file::writer::SerializedFileWriter; use crate::schema::parser::parse_message_type; use crate::schema::types::{Type, TypePtr}; - use crate::util::cursor::SliceableCursor; use crate::util::test_common::RandGen; #[test] @@ -357,8 +394,7 @@ mod tests { file.rewind().unwrap(); - let parquet_reader = SerializedFileReader::try_from(file).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader)); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let record_reader = arrow_reader.get_record_reader(2).unwrap(); let batches = record_reader.collect::>>().unwrap(); @@ -589,9 +625,8 @@ mod tests { let file_variants = vec![("fixed_length", 25), ("int32", 4), ("int64", 10)]; for (prefix, target_precision) in file_variants { let path = format!("{}/{}_decimal.parquet", testdata, prefix); - let parquet_reader = - SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader)); + let file = File::open(&path).unwrap(); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let mut 
record_reader = arrow_reader.get_record_reader(32).unwrap(); @@ -609,7 +644,7 @@ mod tests { assert_eq!(col.scale(), 2); for (i, v) in expected.enumerate() { - assert_eq!(col.value(i), v * 100_i128); + assert_eq!(col.value(i).as_i128(), v * 100_i128); } } } @@ -859,9 +894,7 @@ mod tests { file.rewind().unwrap(); - let parquet_reader = SerializedFileReader::try_from(file).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader)); - + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let mut record_reader = arrow_reader .get_record_reader(opts.record_batch_size) .unwrap(); @@ -1010,11 +1043,7 @@ mod tests { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/nested_structs.rust.parquet", testdata); let file = File::open(&path).unwrap(); - let parquet_file_reader = SerializedFileReader::try_from(file).unwrap(); - let file_metadata = parquet_file_reader.metadata().file_metadata(); - let schema = file_metadata.schema_descr_ptr(); - - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_file_reader)); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let record_batch_reader = arrow_reader .get_record_reader(60) .expect("Failed to read into array!"); @@ -1023,7 +1052,7 @@ mod tests { batch.unwrap(); } - let mask = ProjectionMask::leaves(&schema, [3, 8, 10]); + let mask = ProjectionMask::leaves(arrow_reader.parquet_schema(), [3, 8, 10]); let projected_reader = arrow_reader .get_record_reader_by_columns(mask.clone(), 60) .unwrap(); @@ -1063,9 +1092,8 @@ mod tests { fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/nested_maps.snappy.parquet", testdata); - let parquet_file_reader = - SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_file_reader)); + let file = File::open(&path).unwrap(); + let mut arrow_reader = 
ParquetFileArrowReader::try_new(file).unwrap(); let record_batch_reader = arrow_reader .get_record_reader(60) .expect("Failed to read into array!"); @@ -1112,14 +1140,12 @@ mod tests { writer.close().unwrap(); } - let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); - let file_metadata = file_reader.metadata().file_metadata(); - let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]); + let mut reader = ParquetFileArrowReader::try_new(file).unwrap(); + let mask = ProjectionMask::leaves(reader.parquet_schema(), [0]); - let mut batch = ParquetFileArrowReader::new(file_reader); - let reader = batch.get_record_reader_by_columns(mask, 1024).unwrap(); + let reader = reader.get_record_reader_by_columns(mask, 1024).unwrap(); - let expected_schema = arrow::datatypes::Schema::new(vec![Field::new( + let expected_schema = Schema::new(vec![Field::new( "group", ArrowDataType::Struct(vec![Field::new("leaf", ArrowDataType::Int32, false)]), true, @@ -1150,10 +1176,8 @@ mod tests { 114, 111, 119, 0, 130, 0, 0, 0, 80, 65, 82, 49, ]; - let file = SliceableCursor::new(data); - let file_reader = SerializedFileReader::new(file).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); - + let file = Bytes::from(data); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let mut record_batch_reader = arrow_reader .get_record_reader_by_columns(ProjectionMask::all(), 10) .unwrap(); @@ -1229,8 +1253,7 @@ mod tests { file.rewind().unwrap(); - let parquet_reader = SerializedFileReader::try_from(file).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader)); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let record_reader = arrow_reader.get_record_reader(3).unwrap(); @@ -1268,9 +1291,8 @@ mod tests { fn test_read_null_list() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/null_list.parquet", testdata); - let 
parquet_file_reader = - SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_file_reader)); + let file = File::open(&path).unwrap(); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let mut record_batch_reader = arrow_reader .get_record_reader(60) .expect("Failed to read into array!"); @@ -1390,12 +1412,12 @@ mod tests { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/alltypes_plain.parquet", testdata); let file = File::open(&path).unwrap(); - let reader = SerializedFileReader::try_from(file).unwrap(); - let file_metadata = reader.metadata().file_metadata(); + + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); + let file_metadata = arrow_reader.metadata().file_metadata(); let expected_rows = file_metadata.num_rows() as usize; let schema = file_metadata.schema_descr_ptr(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(reader)); let mask = ProjectionMask::leaves(&schema, []); let batch_reader = arrow_reader.get_record_reader_by_columns(mask, 2).unwrap(); diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs similarity index 100% rename from parquet/src/arrow/levels.rs rename to parquet/src/arrow/arrow_writer/levels.rs diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer/mod.rs similarity index 98% rename from parquet/src/arrow/arrow_writer.rs rename to parquet/src/arrow/arrow_writer/mod.rs index 530dfe2ad090..83f1bc70b525 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -27,18 +27,20 @@ use arrow::datatypes::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::Array; -use super::levels::LevelInfo; use super::schema::{ add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, decimal_length_from_precision, }; -use 
crate::arrow::levels::calculate_array_levels; use crate::column::writer::ColumnWriter; use crate::errors::{ParquetError, Result}; +use crate::file::metadata::RowGroupMetaDataPtr; use crate::file::properties::WriterProperties; use crate::file::writer::{SerializedColumnWriter, SerializedRowGroupWriter}; use crate::{data_type::*, file::writer::SerializedFileWriter}; +use levels::{calculate_array_levels, LevelInfo}; + +mod levels; /// Arrow writer /// @@ -95,6 +97,11 @@ impl ArrowWriter { }) } + /// Returns metadata for any flushed row groups + pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + self.writer.flushed_row_groups() + } + /// Enqueues the provided `RecordBatch` to be written /// /// If following this there are more than `max_row_group_size` rows buffered, @@ -569,7 +576,7 @@ macro_rules! def_get_binary_array_fn { fn $name(array: &$ty) -> Vec { let mut byte_array = ByteArray::new(); let ptr = crate::util::memory::ByteBufferPtr::new( - unsafe { array.value_data().typed_data::() }.to_vec(), + array.value_data().as_slice().to_vec(), ); byte_array.set_data(ptr); array @@ -666,7 +673,7 @@ fn get_decimal_array_slice( let mut values = Vec::with_capacity(indices.len()); let size = decimal_length_from_precision(array.precision()); for i in indices { - let as_be_bytes = array.value(*i).to_be_bytes(); + let as_be_bytes = array.value(*i).as_i128().to_be_bytes(); let resized_value = as_be_bytes[(16 - size)..].to_vec(); values.push(FixedLenByteArray::from(ByteArray::from(resized_value))); } @@ -689,6 +696,7 @@ fn get_fsb_array_slice( mod tests { use super::*; + use bytes::Bytes; use std::fs::File; use std::sync::Arc; @@ -750,9 +758,8 @@ mod tests { writer.close().unwrap(); } - let cursor = crate::file::serialized_reader::SliceableCursor::new(buffer); - let reader = SerializedFileReader::new(cursor).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(reader)); + let cursor = Bytes::from(buffer); + let mut arrow_reader = 
ParquetFileArrowReader::try_new(cursor).unwrap(); let mut record_batch_reader = arrow_reader.get_record_reader(1024).unwrap(); let actual_batch = record_batch_reader @@ -1187,8 +1194,8 @@ mod tests { writer.write(&expected_batch).unwrap(); writer.close().unwrap(); - let reader = SerializedFileReader::new(file.try_clone().unwrap()).unwrap(); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(reader)); + let mut arrow_reader = + ParquetFileArrowReader::try_new(file.try_clone().unwrap()).unwrap(); let mut record_batch_reader = arrow_reader.get_record_reader(1024).unwrap(); let actual_batch = record_batch_reader @@ -1917,10 +1924,9 @@ mod tests { writer.close().unwrap(); - let reader = SerializedFileReader::new(file).unwrap(); - assert_eq!(&row_group_sizes(reader.metadata()), &[200, 200, 50]); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); + assert_eq!(&row_group_sizes(arrow_reader.metadata()), &[200, 200, 50]); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(reader)); let batches = arrow_reader .get_record_reader(100) .unwrap() @@ -2060,13 +2066,12 @@ mod tests { writer.close().unwrap(); // Read Data - let reader = SerializedFileReader::new(file).unwrap(); - // Should have written entire first batch and first row of second to the first row group // leaving a single row in the second row group - assert_eq!(&row_group_sizes(reader.metadata()), &[6, 1]); - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(reader)); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); + assert_eq!(&row_group_sizes(arrow_reader.metadata()), &[6, 1]); + let batches = arrow_reader .get_record_reader(2) .unwrap() diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 5cd091184bfa..3f14114e3c60 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -78,13 +78,15 @@ use std::collections::VecDeque; use std::fmt::Formatter; use std::io::{Cursor, 
SeekFrom}; +use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use byteorder::{ByteOrder, LittleEndian}; +use bytes::{Buf, Bytes}; use futures::future::{BoxFuture, FutureExt}; use futures::stream::Stream; +use parquet_format::PageType; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; use arrow::datatypes::SchemaRef; @@ -95,14 +97,64 @@ use crate::arrow::arrow_reader::ParquetRecordBatchReader; use crate::arrow::schema::parquet_to_arrow_schema; use crate::arrow::ProjectionMask; use crate::basic::Compression; -use crate::column::page::{PageIterator, PageReader}; +use crate::column::page::{Page, PageIterator, PageReader}; +use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; -use crate::file::footer::parse_metadata_buffer; +use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::ParquetMetaData; use crate::file::reader::SerializedPageReader; -use crate::file::PARQUET_MAGIC; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; -use crate::util::memory::ByteBufferPtr; +use crate::file::serialized_reader::{decode_page, read_page_header}; +use crate::file::FOOTER_SIZE; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; + +/// The asynchronous interface used by [`ParquetRecordBatchStream`] to read parquet files +pub trait AsyncFileReader { + /// Retrieve the bytes in `range` + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result>; + + /// Provides asynchronous access to the [`ParquetMetaData`] of a parquet file, + /// allowing fine-grained control over how metadata is sourced, in particular allowing + /// for caching, pre-fetching, catalog metadata, etc... 
+ fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; +} + +impl AsyncFileReader for T { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { + async move { + self.seek(SeekFrom::Start(range.start as u64)).await?; + + let to_read = range.end - range.start; + let mut buffer = Vec::with_capacity(to_read); + let read = self.take(to_read as u64).read_to_end(&mut buffer).await?; + if read != to_read { + eof_err!("expected to read {} bytes, got {}", to_read, read); + } + + Ok(buffer.into()) + } + .boxed() + } + + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + const FOOTER_SIZE_I64: i64 = FOOTER_SIZE as i64; + async move { + self.seek(SeekFrom::End(-FOOTER_SIZE_I64)).await?; + + let mut buf = [0_u8; FOOTER_SIZE]; + self.read_exact(&mut buf).await?; + + let metadata_len = decode_footer(&buf)?; + self.seek(SeekFrom::End(-FOOTER_SIZE_I64 - metadata_len as i64)) + .await?; + + let mut buf = Vec::with_capacity(metadata_len); + self.read_to_end(&mut buf).await?; + + Ok(Arc::new(decode_metadata(&buf)?)) + } + .boxed() + } +} /// A builder used to construct a [`ParquetRecordBatchStream`] for a parquet file /// @@ -124,10 +176,10 @@ pub struct ParquetRecordBatchStreamBuilder { projection: ProjectionMask, } -impl ParquetRecordBatchStreamBuilder { +impl ParquetRecordBatchStreamBuilder { /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file pub async fn new(mut input: T) -> Result { - let metadata = Arc::new(read_footer(&mut input).await?); + let metadata = input.get_metadata().await?; let schema = Arc::new(parquet_to_arrow_schema( metadata.file_metadata().schema_descr(), @@ -149,6 +201,11 @@ impl ParquetRecordBatchStreamBuilder { &self.metadata } + /// Returns the parquet [`SchemaDescriptor`] for this parquet file + pub fn parquet_schema(&self) -> &SchemaDescriptor { + self.metadata.file_metadata().schema_descr() + } + /// Returns the arrow [`SchemaRef`] for this parquet file pub fn schema(&self) -> &SchemaRef { &self.schema @@ 
-264,8 +321,9 @@ impl ParquetRecordBatchStream { } } -impl Stream - for ParquetRecordBatchStream +impl Stream for ParquetRecordBatchStream +where + T: AsyncFileReader + Unpin + Send + 'static, { type Item = Result; @@ -309,6 +367,7 @@ impl Stream let mut column_chunks = vec![None; row_group_metadata.columns().len()]; + // TODO: Combine consecutive ranges for (idx, chunk) in column_chunks.iter_mut().enumerate() { if !projection.leaf_included(idx) { continue; @@ -316,18 +375,16 @@ impl Stream let column = row_group_metadata.column(idx); let (start, length) = column.byte_range(); - let end = start + length; - - input.seek(SeekFrom::Start(start)).await?; - let mut buffer = vec![0_u8; (end - start) as usize]; - input.read_exact(buffer.as_mut_slice()).await?; + let data = input + .get_bytes(start as usize..(start + length) as usize) + .await?; *chunk = Some(InMemoryColumnChunk { num_values: column.num_values(), compression: column.compression(), physical_type: column.column_type(), - data: ByteBufferPtr::new(buffer), + data, }); } @@ -379,34 +436,7 @@ impl Stream } } -async fn read_footer( - input: &mut T, -) -> Result { - input.seek(SeekFrom::End(-8)).await?; - - let mut buf = [0_u8; 8]; - input.read_exact(&mut buf).await?; - - if buf[4..] != PARQUET_MAGIC { - return Err(general_err!("Invalid Parquet file. Corrupt footer")); - } - - let metadata_len = LittleEndian::read_i32(&buf[..4]) as i64; - if metadata_len < 0 { - return Err(general_err!( - "Invalid Parquet file. 
Metadata length is less than zero ({})", - metadata_len - )); - } - - input.seek(SeekFrom::End(-8 - metadata_len)).await?; - - let mut buf = Vec::with_capacity(metadata_len as usize + 8); - input.read_to_end(&mut buf).await?; - - parse_metadata_buffer(&mut Cursor::new(buf)) -} - +/// An in-memory collection of column chunks struct InMemoryRowGroup { schema: SchemaDescPtr, column_chunks: Vec>, @@ -433,18 +463,19 @@ impl RowGroupCollection for InMemoryRowGroup { } } +/// Data for a single column chunk #[derive(Clone)] struct InMemoryColumnChunk { num_values: i64, compression: Compression, physical_type: crate::basic::Type, - data: ByteBufferPtr, + data: Bytes, } impl InMemoryColumnChunk { fn pages(&self) -> Result> { let page_reader = SerializedPageReader::new( - Cursor::new(self.data.clone()), + self.data.clone().reader(), self.num_values, self.compression, self.physical_type, @@ -454,6 +485,82 @@ impl InMemoryColumnChunk { } } +// A serialized implementation for Parquet [`PageReader`]. +struct InMemoryColumnChunkReader { + chunk: InMemoryColumnChunk, + decompressor: Option>, + offset: usize, + seen_num_values: i64, +} + +impl InMemoryColumnChunkReader { + /// Creates a new serialized page reader from file source. 
+ pub fn new(chunk: InMemoryColumnChunk) -> Result { + let decompressor = create_codec(chunk.compression)?; + let result = Self { + chunk, + decompressor, + offset: 0, + seen_num_values: 0, + }; + Ok(result) + } +} + +impl Iterator for InMemoryColumnChunkReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.get_next_page().transpose() + } +} + +impl PageReader for InMemoryColumnChunkReader { + fn get_next_page(&mut self) -> Result> { + while self.seen_num_values < self.chunk.num_values { + let mut cursor = Cursor::new(&self.chunk.data.as_ref()[self.offset..]); + let page_header = read_page_header(&mut cursor)?; + let compressed_size = page_header.compressed_page_size as usize; + + self.offset += cursor.position() as usize; + let start_offset = self.offset; + let end_offset = self.offset + compressed_size; + self.offset = end_offset; + + let buffer = self.chunk.data.slice(start_offset..end_offset); + + let result = match page_header.type_ { + PageType::DataPage | PageType::DataPageV2 => { + let decoded = decode_page( + page_header, + buffer.into(), + self.chunk.physical_type, + self.decompressor.as_mut(), + )?; + self.seen_num_values += decoded.num_values() as i64; + decoded + } + PageType::DictionaryPage => decode_page( + page_header, + buffer.into(), + self.chunk.physical_type, + self.decompressor.as_mut(), + )?, + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + continue; + } + }; + + return Ok(Some(result)); + } + + // We are at the end of this column chunk and no more page left. Return None. 
+ Ok(None) + } +} + +/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`] struct ColumnChunkIterator { schema: SchemaDescPtr, column_schema: ColumnDescPtr, @@ -477,3 +584,82 @@ impl PageIterator for ColumnChunkIterator { Ok(self.column_schema.clone()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::{ArrowReader, ParquetFileArrowReader}; + use arrow::error::Result as ArrowResult; + use futures::TryStreamExt; + use std::sync::Mutex; + + struct TestReader { + data: Bytes, + metadata: Arc, + requests: Arc>>>, + } + + impl AsyncFileReader for TestReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { + self.requests.lock().unwrap().push(range.clone()); + futures::future::ready(Ok(self.data.slice(range))).boxed() + } + + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + futures::future::ready(Ok(self.metadata.clone())).boxed() + } + } + + #[tokio::test] + async fn test_async_reader() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/alltypes_plain.parquet", testdata); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = crate::file::footer::parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let requests = async_reader.requests.clone(); + let builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); + + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); + let stream = builder + .with_projection(mask.clone()) + .with_batch_size(1024) + .build() + .unwrap(); + + let async_batches: Vec<_> = stream.try_collect().await.unwrap(); + + let mut sync_reader = ParquetFileArrowReader::try_new(data).unwrap(); + let sync_batches = sync_reader + .get_record_reader_by_columns(mask, 1024) + .unwrap() + 
.collect::>>() + .unwrap(); + + assert_eq!(async_batches, sync_batches); + + let requests = requests.lock().unwrap(); + let (offset_1, length_1) = metadata.row_group(0).column(1).byte_range(); + let (offset_2, length_2) = metadata.row_group(0).column(2).byte_range(); + + assert_eq!( + &requests[..], + &[ + offset_1 as usize..(offset_1 + length_1) as usize, + offset_2 as usize..(offset_2 + length_2) as usize + ] + ); + } +} diff --git a/parquet/src/arrow/bit_util.rs b/parquet/src/arrow/buffer/bit_util.rs similarity index 100% rename from parquet/src/arrow/bit_util.rs rename to parquet/src/arrow/buffer/bit_util.rs diff --git a/parquet/src/arrow/converter.rs b/parquet/src/arrow/buffer/converter.rs similarity index 100% rename from parquet/src/arrow/converter.rs rename to parquet/src/arrow/buffer/converter.rs diff --git a/parquet/src/arrow/array_reader/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs similarity index 98% rename from parquet/src/arrow/array_reader/dictionary_buffer.rs rename to parquet/src/arrow/buffer/dictionary_buffer.rs index 6dc9cc80f398..ffa3a4843c50 100644 --- a/parquet/src/arrow/array_reader/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::arrow::array_reader::offset_buffer::OffsetBuffer; +use crate::arrow::buffer::offset_buffer::OffsetBuffer; use crate::arrow::record_reader::buffer::{ BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer, }; @@ -106,7 +106,7 @@ impl Self::Dict { keys, values } => { let mut spilled = OffsetBuffer::default(); let dict_buffers = values.data().buffers(); - let dict_offsets = unsafe { dict_buffers[0].typed_data::() }; + let dict_offsets = dict_buffers[0].typed_data::(); let dict_values = dict_buffers[1].as_slice(); if values.is_empty() { diff --git a/arrow/src/arch/mod.rs b/parquet/src/arrow/buffer/mod.rs similarity index 79% rename from arrow/src/arch/mod.rs rename to parquet/src/arrow/buffer/mod.rs index 56d8f4c0e2cf..5ee89aa1a782 100644 --- a/arrow/src/arch/mod.rs +++ b/parquet/src/arrow/buffer/mod.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -/// -/// Arch module contains architecture specific code. -/// Be aware that not all machines have these specific operations available. -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub(crate) mod avx512; +//! Logic for reading data into arrow buffers + +pub mod bit_util; +pub mod converter; +pub mod dictionary_buffer; +pub mod offset_buffer; diff --git a/parquet/src/arrow/array_reader/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs similarity index 98% rename from parquet/src/arrow/array_reader/offset_buffer.rs rename to parquet/src/arrow/buffer/offset_buffer.rs index 23e7af7595c4..2d73e3f146b6 100644 --- a/parquet/src/arrow/array_reader/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::arrow::bit_util::iter_set_bits_rev; +use crate::arrow::buffer::bit_util::iter_set_bits_rev; use crate::arrow::record_reader::buffer::{ BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer, }; @@ -58,7 +58,7 @@ impl OffsetBuffer { /// the start of a UTF-8 codepoint /// /// Note: This does not verify that the entirety of `data` is valid - /// UTF-8. This should be done by calling [`Self::values_as_str`] after + /// UTF-8. This should be done by calling [`Self::check_valid_utf8`] after /// all data has been written pub fn try_push(&mut self, data: &[u8], validate_utf8: bool) -> Result<()> { if validate_utf8 { diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 5a5135cd34d7..3aee7cf42cbc 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -96,12 +96,9 @@ //! # writer.close().unwrap(); //! //! let file = File::open("data.parquet").unwrap(); -//! let file_reader = SerializedFileReader::new(file).unwrap(); //! -//! let file_metadata = file_reader.metadata().file_metadata(); -//! let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]); -//! -//! let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); +//! let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); +//! let mask = ProjectionMask::leaves(arrow_reader.parquet_schema(), [0]); //! //! println!("Converted arrow schema is: {}", arrow_reader.get_schema().unwrap()); //! 
println!("Arrow schema after projection is: {}", @@ -125,14 +122,12 @@ experimental_mod!(array_reader); pub mod arrow_reader; pub mod arrow_writer; -mod bit_util; +mod buffer; #[cfg(feature = "async")] pub mod async_reader; -experimental_mod!(converter); -pub(in crate::arrow) mod levels; -pub(in crate::arrow) mod record_reader; +mod record_reader; experimental_mod!(schema); pub use self::arrow_reader::ArrowReader; diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs index 4fa8213dedcc..7101eaa9ccc9 100644 --- a/parquet/src/arrow/record_reader/buffer.rs +++ b/parquet/src/arrow/record_reader/buffer.rs @@ -17,9 +17,9 @@ use std::marker::PhantomData; -use crate::arrow::bit_util::iter_set_bits_rev; +use crate::arrow::buffer::bit_util::iter_set_bits_rev; use arrow::buffer::{Buffer, MutableBuffer}; -use arrow::datatypes::ToByteSlice; +use arrow::datatypes::ArrowNativeType; /// A buffer that supports writing new data to the end, and removing data from the front /// @@ -172,7 +172,7 @@ impl ScalarBuffer { } } -impl ScalarBuffer { +impl ScalarBuffer { pub fn push(&mut self, v: T) { self.buffer.push(v); self.len += 1; diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 93de4006c10d..9cca25c8ae5c 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -21,7 +21,7 @@ use arrow::array::BooleanBufferBuilder; use arrow::bitmap::Bitmap; use arrow::buffer::Buffer; -use crate::arrow::bit_util::count_set_bits; +use crate::arrow::buffer::bit_util::count_set_bits; use crate::arrow::record_reader::buffer::BufferQueue; use crate::basic::Encoding; use crate::column::reader::decoder::{ diff --git a/parquet/src/arrow/record_reader.rs b/parquet/src/arrow/record_reader/mod.rs similarity index 99% rename from parquet/src/arrow/record_reader.rs rename to parquet/src/arrow/record_reader/mod.rs index 
89d782b1aca8..023a538a2741 100644 --- a/parquet/src/arrow/record_reader.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -573,7 +573,7 @@ mod tests { // Verify result record data let actual = record_reader.consume_record_data().unwrap(); - let actual_values = unsafe { actual.typed_data::() }; + let actual_values = actual.typed_data::(); let expected = &[0, 7, 0, 6, 3, 0, 8]; assert_eq!(actual_values.len(), expected.len()); @@ -687,7 +687,7 @@ mod tests { // Verify result record data let actual = record_reader.consume_record_data().unwrap(); - let actual_values = unsafe { actual.typed_data::() }; + let actual_values = actual.typed_data::(); let expected = &[4, 0, 0, 7, 6, 3, 2, 8, 9]; assert_eq!(actual_values.len(), expected.len()); diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 5416e4078538..f3d0a3d9b36b 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -478,11 +478,11 @@ fn arrow_to_parquet_type(field: &Field) -> Result { mod tests { use super::*; - use std::{collections::HashMap, convert::TryFrom, sync::Arc}; + use std::{collections::HashMap, sync::Arc}; use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; - use crate::file::{metadata::KeyValue, reader::SerializedFileReader}; + use crate::file::metadata::KeyValue; use crate::{ arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader}, schema::{parser::parse_message_type, types::SchemaDescriptor}, @@ -571,9 +571,12 @@ mod tests { ]; assert_eq!(&arrow_fields, converted_arrow_schema.fields()); - let converted_arrow_schema = - parquet_to_arrow_schema_by_columns(&parquet_schema, ProjectionMask::all(), None) - .unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema_by_columns( + &parquet_schema, + ProjectionMask::all(), + None, + ) + .unwrap(); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -1599,13 +1602,13 @@ mod tests { writer.close()?; // read file back - let parquet_reader = SerializedFileReader::try_from(file)?; - 
let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader)); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let read_schema = arrow_reader.get_schema()?; assert_eq!(schema, read_schema); // read all fields by columns - let partial_read_schema = arrow_reader.get_schema_by_columns(ProjectionMask::all())?; + let partial_read_schema = + arrow_reader.get_schema_by_columns(ProjectionMask::all())?; assert_eq!(schema, partial_read_schema); Ok(()) @@ -1668,13 +1671,13 @@ mod tests { writer.close()?; // read file back - let parquet_reader = SerializedFileReader::try_from(file)?; - let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader)); + let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); let read_schema = arrow_reader.get_schema()?; assert_eq!(schema, read_schema); // read all fields by columns - let partial_read_schema = arrow_reader.get_schema_by_columns(ProjectionMask::all())?; + let partial_read_schema = + arrow_reader.get_schema_by_columns(ProjectionMask::all())?; assert_eq!(schema, partial_read_schema); Ok(()) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 7eff2156fd9d..59a0fe07b7de 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -41,7 +41,7 @@ pub use parquet_format::{ /// control the on disk storage format. /// For example INT16 is not included as a type since a good encoding of INT32 /// would handle this. 
-#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Type { BOOLEAN, INT32, diff --git a/parquet/src/bin/parquet-fromcsv-help.txt b/parquet/src/bin/parquet-fromcsv-help.txt new file mode 100644 index 000000000000..f4fe704ab267 --- /dev/null +++ b/parquet/src/bin/parquet-fromcsv-help.txt @@ -0,0 +1,66 @@ +Apache Arrow +Binary to convert csv to Parquet + +USAGE: + parquet [OPTIONS] --schema --input-file --output-file + +OPTIONS: + -b, --batch-size + batch size + + [env: PARQUET_FROM_CSV_BATCHSIZE=] + [default: 1000] + + -c, --parquet-compression + compression mode + + [default: SNAPPY] + + -d, --delimiter + field delimiter + + default value: when input_format==CSV: ',' when input_format==TSV: 'TAB' + + -D, --double-quote + double quote + + -e, --escape-char + escape charactor + + -f, --input-format + input file format + + [default: csv] + [possible values: csv, tsv] + + -h, --has-header + has header + + --help + Print help information + + -i, --input-file + input CSV file + + -m, --max-row-group-size + max row group size + + -o, --output-file + output Parquet file + + -q, --quote-char + quate charactor + + -r, --record-terminator + record terminator + + [possible values: lf, crlf, cr] + + -s, --schema + message schema for output Parquet + + -V, --version + Print version information + + -w, --writer-version + writer version diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs new file mode 100644 index 000000000000..aa1d50563cd9 --- /dev/null +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -0,0 +1,636 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to converts csv to Parquet file +//! +//! # Install +//! +//! `parquet-fromcsv` can be installed using `cargo`: +//! +//! ```text +//! cargo install parquet --features=cli +//! ``` +//! +//! After this `parquet-fromcsv` shoud be available: +//! +//! ```text +//! parquet-fromcsv --schema message_schema_for_parquet.txt input.csv output.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! +//! ```text +//! cargo run --features=cli --bin parquet-fromcsv --schema message_schema_for_parquet.txt \ +//! \ input.csv output.parquet +//! ``` +//! +//! # Options +//! +//! ```text +#![doc = include_str!("./parquet-fromcsv-help.txt")] // Update for this file : Run test test_command_help +//! ``` +//! +//! ## Parquet file options +//! +//! - `-b`, `--batch-size` : Batch size for Parquet +//! - `-c`, `--parquet-compression` : Compression option for Parquet, default is SNAPPY +//! - `-s`, `--schema` : Path to message schema for generated Parquet file +//! - `-o`, `--output-file` : Path to output Parquet file +//! - `-w`, `--writer-version` : Writer version +//! - `-m`, `--max-row-group-size` : Max row group size +//! +//! ## Input file options +//! +//! - `-i`, `--input-file` : Path to input CSV file +//! - `-f`, `--input-format` : Dialect for input file, `csv` or `tsv`. +//! - `-d`, `--delimiter : Field delimitor for CSV file, default depends `--input-format` +//! - `-e`, `--escape` : Escape charactor for input file +//! - `-h`, `--has-header` : Input has header +//! 
- `-r`, `--record-terminator` : Record terminator charactor for input. default is CRLF +//! - `-q`, `--quote-char` : Input quoting charactor +//! + +use std::{ + fmt::Display, + fs::{read_to_string, File}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use arrow::{csv::ReaderBuilder, datatypes::Schema, error::ArrowError}; +use clap::{ArgEnum, Parser}; +use parquet::{ + arrow::{parquet_to_arrow_schema, ArrowWriter}, + basic::Compression, + errors::ParquetError, + file::properties::{WriterProperties, WriterVersion}, + schema::{parser::parse_message_type, types::SchemaDescriptor}, +}; + +#[derive(Debug)] +enum ParquetFromCsvError { + CommandLineParseError(clap::Error), + IoError(std::io::Error), + ArrowError(ArrowError), + ParquetError(ParquetError), + WithContext(String, Box), +} + +impl From for ParquetFromCsvError { + fn from(e: std::io::Error) -> Self { + Self::IoError(e) + } +} + +impl From for ParquetFromCsvError { + fn from(e: ArrowError) -> Self { + Self::ArrowError(e) + } +} + +impl From for ParquetFromCsvError { + fn from(e: ParquetError) -> Self { + Self::ParquetError(e) + } +} + +impl From for ParquetFromCsvError { + fn from(e: clap::Error) -> Self { + Self::CommandLineParseError(e) + } +} + +impl ParquetFromCsvError { + pub fn with_context>( + inner_error: E, + context: &str, + ) -> ParquetFromCsvError { + let inner = inner_error.into(); + ParquetFromCsvError::WithContext(context.to_string(), Box::new(inner)) + } +} + +impl Display for ParquetFromCsvError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ParquetFromCsvError::CommandLineParseError(e) => write!(f, "{}", e), + ParquetFromCsvError::IoError(e) => write!(f, "{}", e), + ParquetFromCsvError::ArrowError(e) => write!(f, "{}", e), + ParquetFromCsvError::ParquetError(e) => write!(f, "{}", e), + ParquetFromCsvError::WithContext(c, e) => { + writeln!(f, "{}", e)?; + write!(f, "context: {}", c) + } + } + } +} + +#[derive(Debug, Parser)] +#[clap(author, version, 
about("Binary to convert csv to Parquet"), long_about=None)] +struct Args { + /// Path to a text file containing a parquet schema definition + #[clap(short, long, help("message schema for output Parquet"))] + schema: PathBuf, + /// input CSV file path + #[clap(short, long, help("input CSV file"))] + input_file: PathBuf, + /// output Parquet file path + #[clap(short, long, help("output Parquet file"))] + output_file: PathBuf, + /// input file format + #[clap( + arg_enum, + short('f'), + long, + help("input file format"), + default_value_t=CsvDialect::Csv + )] + input_format: CsvDialect, + /// batch size + #[clap( + short, + long, + help("batch size"), + default_value_t = 1000, + env = "PARQUET_FROM_CSV_BATCHSIZE" + )] + batch_size: usize, + /// has header line + #[clap(short, long, help("has header"))] + has_header: bool, + /// field delimiter + /// + /// default value: + /// when input_format==CSV: ',' + /// when input_format==TSV: 'TAB' + #[clap(short, long, help("field delimiter"))] + delimiter: Option, + #[clap(arg_enum, short, long, help("record terminator"))] + record_terminator: Option, + #[clap(short, long, help("escape charactor"))] + escape_char: Option, + #[clap(short, long, help("quate charactor"))] + quote_char: Option, + #[clap(short('D'), long, help("double quote"))] + double_quote: Option, + #[clap(short('c'), long, help("compression mode"), default_value_t=Compression::SNAPPY)] + #[clap(parse(try_from_str =compression_from_str))] + parquet_compression: Compression, + + #[clap(short, long, help("writer version"))] + #[clap(parse(try_from_str =writer_version_from_str))] + writer_version: Option, + #[clap(short, long, help("max row group size"))] + max_row_group_size: Option, +} + +fn compression_from_str(cmp: &str) -> Result { + match cmp.to_uppercase().as_str() { + "UNCOMPRESSED" => Ok(Compression::UNCOMPRESSED), + "SNAPPY" => Ok(Compression::SNAPPY), + "GZIP" => Ok(Compression::GZIP), + "LZO" => Ok(Compression::LZO), + "BROTLI" => 
Ok(Compression::BROTLI), + "LZ4" => Ok(Compression::LZ4), + "ZSTD" => Ok(Compression::ZSTD), + v => Err( + format!("Unknown compression {0} : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD ",v) + ) + } +} + +fn writer_version_from_str(cmp: &str) -> Result { + match cmp.to_uppercase().as_str() { + "1" => Ok(WriterVersion::PARQUET_1_0), + "2" => Ok(WriterVersion::PARQUET_2_0), + v => Err(format!( + "Unknown writer version {0} : possible values 1, 2", + v + )), + } +} + +impl Args { + fn schema_path(&self) -> &Path { + self.schema.as_path() + } + fn get_delimiter(&self) -> u8 { + match self.delimiter { + Some(ch) => ch as u8, + None => match self.input_format { + CsvDialect::Csv => b',', + CsvDialect::Tsv => b'\t', + }, + } + } + fn get_terminator(&self) -> Option { + match self.record_terminator { + Some(RecordTerminator::LF) => Some(0x0a), + Some(RecordTerminator::CR) => Some(0x0d), + Some(RecordTerminator::Crlf) => None, + None => match self.input_format { + CsvDialect::Csv => None, + CsvDialect::Tsv => Some(0x0a), + }, + } + } + fn get_escape(&self) -> Option { + self.escape_char.map(|ch| ch as u8) + } + fn get_quote(&self) -> Option { + if self.quote_char.is_none() { + match self.input_format { + CsvDialect::Csv => Some(b'\"'), + CsvDialect::Tsv => None, + } + } else { + self.quote_char.map(|c| c as u8) + } + } +} + +#[derive(Debug, Clone, Copy, ArgEnum, PartialEq)] +enum CsvDialect { + Csv, + Tsv, +} + +#[derive(Debug, Clone, Copy, ArgEnum, PartialEq)] +enum RecordTerminator { + LF, + Crlf, + CR, +} + +fn configure_writer_properties(args: &Args) -> WriterProperties { + let mut properties_builder = + WriterProperties::builder().set_compression(args.parquet_compression); + if let Some(writer_version) = args.writer_version { + properties_builder = properties_builder.set_writer_version(writer_version); + } + if let Some(max_row_group_size) = args.max_row_group_size { + properties_builder = + 
properties_builder.set_max_row_group_size(max_row_group_size); + } + properties_builder.build() +} + +fn configure_reader_builder(args: &Args, arrow_schema: Arc) -> ReaderBuilder { + fn configure_reader ReaderBuilder>( + builder: ReaderBuilder, + value: Option, + fun: F, + ) -> ReaderBuilder { + if let Some(val) = value { + fun(builder, val) + } else { + builder + } + } + + let mut builder = ReaderBuilder::new() + .with_schema(arrow_schema) + .with_batch_size(args.batch_size) + .has_header(args.has_header) + .with_delimiter(args.get_delimiter()); + + builder = configure_reader( + builder, + args.get_terminator(), + ReaderBuilder::with_terminator, + ); + builder = configure_reader(builder, args.get_escape(), ReaderBuilder::with_escape); + builder = configure_reader(builder, args.get_quote(), ReaderBuilder::with_quote); + + builder +} + +fn arrow_schema_from_string(schema: &str) -> Result, ParquetFromCsvError> { + let schema = Arc::new(parse_message_type(schema)?); + let desc = SchemaDescriptor::new(schema); + let arrow_schema = Arc::new(parquet_to_arrow_schema(&desc, None)?); + Ok(arrow_schema) +} + +fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { + let schema = read_to_string(args.schema_path()).map_err(|e| { + ParquetFromCsvError::with_context( + e, + &format!("Failed to open schema file {:#?}", args.schema_path()), + ) + })?; + let arrow_schema = arrow_schema_from_string(&schema)?; + + // create output parquet writer + let parquet_file = File::create(&args.output_file).map_err(|e| { + ParquetFromCsvError::with_context( + e, + &format!("Failed to create output file {:#?}", &args.output_file), + ) + })?; + + let writer_properties = Some(configure_writer_properties(args)); + let mut arrow_writer = + ArrowWriter::try_new(parquet_file, arrow_schema.clone(), writer_properties) + .map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to create ArrowWriter") + })?; + + // open input file + let input_file = 
File::open(&args.input_file).map_err(|e| { + ParquetFromCsvError::with_context( + e, + &format!("Failed to open input file {:#?}", &args.input_file), + ) + })?; + // create input csv reader + let builder = configure_reader_builder(args, arrow_schema); + let reader = builder.build(input_file)?; + for batch_result in reader { + let batch = batch_result.map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to read RecordBatch from CSV") + })?; + arrow_writer.write(&batch).map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to write RecordBatch to parquet") + })?; + } + arrow_writer + .close() + .map_err(|e| ParquetFromCsvError::with_context(e, "Failed to close parquet"))?; + Ok(()) +} + +fn main() -> Result<(), ParquetFromCsvError> { + let args = Args::parse(); + convert_csv_to_parquet(&args) +} + +#[cfg(test)] +mod tests { + use std::{ + io::{Seek, SeekFrom, Write}, + path::{Path, PathBuf}, + }; + + use super::*; + use arrow::datatypes::{DataType, Field}; + use clap::{CommandFactory, Parser}; + use tempfile::NamedTempFile; + + #[test] + fn test_command_help() { + let mut cmd = Args::command(); + let dir = std::env::var("CARGO_MANIFEST_DIR").unwrap(); + let mut path_buf = PathBuf::from(dir); + path_buf.push("src"); + path_buf.push("bin"); + path_buf.push("parquet-fromcsv-help.txt"); + let expected = std::fs::read_to_string(path_buf).unwrap(); + let mut buffer_vec = Vec::new(); + let mut buffer = std::io::Cursor::new(&mut buffer_vec); + cmd.write_long_help(&mut buffer).unwrap(); + // Remove Parquet version string from the help text + let mut actual = String::from_utf8(buffer_vec).unwrap(); + let pos = actual.find('\n').unwrap() + 1; + actual = actual[pos..].to_string(); + assert_eq!( + expected, actual, + "help text not match. 
please update to \n---\n{}\n---\n", + actual + ) + } + + fn parse_args(mut extra_args: Vec<&str>) -> Result { + let mut args = vec![ + "test", + "--schema", + "test.schema", + "--input-file", + "infile.csv", + "--output-file", + "out.parquet", + ]; + args.append(&mut extra_args); + let args = Args::try_parse_from(args.iter())?; + Ok(args) + } + + #[test] + fn test_parse_arg_minimum() -> Result<(), ParquetFromCsvError> { + let args = parse_args(vec![])?; + + assert_eq!(args.schema, PathBuf::from(Path::new("test.schema"))); + assert_eq!(args.input_file, PathBuf::from(Path::new("infile.csv"))); + assert_eq!(args.output_file, PathBuf::from(Path::new("out.parquet"))); + // test default values + assert_eq!(args.input_format, CsvDialect::Csv); + assert_eq!(args.batch_size, 1000); + assert_eq!(args.has_header, false); + assert_eq!(args.delimiter, None); + assert_eq!(args.get_delimiter(), b','); + assert_eq!(args.record_terminator, None); + assert_eq!(args.get_terminator(), None); // CRLF + assert_eq!(args.quote_char, None); + assert_eq!(args.get_quote(), Some(b'\"')); + assert_eq!(args.double_quote, None); + assert_eq!(args.parquet_compression, Compression::SNAPPY); + Ok(()) + } + + #[test] + fn test_parse_arg_format_variants() -> Result<(), ParquetFromCsvError> { + let args = parse_args(vec!["--input-format", "csv"])?; + assert_eq!(args.input_format, CsvDialect::Csv); + assert_eq!(args.get_delimiter(), b','); + assert_eq!(args.get_terminator(), None); // CRLF + assert_eq!(args.get_quote(), Some(b'\"')); + assert_eq!(args.get_escape(), None); + let args = parse_args(vec!["--input-format", "tsv"])?; + assert_eq!(args.input_format, CsvDialect::Tsv); + assert_eq!(args.get_delimiter(), b'\t'); + assert_eq!(args.get_terminator(), Some(b'\x0a')); // LF + assert_eq!(args.get_quote(), None); // quote none + assert_eq!(args.get_escape(), None); + + let args = parse_args(vec!["--input-format", "csv", "--escape-char", "\\"])?; + assert_eq!(args.input_format, CsvDialect::Csv); + 
assert_eq!(args.get_delimiter(), b','); + assert_eq!(args.get_terminator(), None); // CRLF + assert_eq!(args.get_quote(), Some(b'\"')); + assert_eq!(args.get_escape(), Some(b'\\')); + + let args = parse_args(vec!["--input-format", "tsv", "--delimiter", ":"])?; + assert_eq!(args.input_format, CsvDialect::Tsv); + assert_eq!(args.get_delimiter(), b':'); + assert_eq!(args.get_terminator(), Some(b'\x0a')); // LF + assert_eq!(args.get_quote(), None); // quote none + assert_eq!(args.get_escape(), None); + + Ok(()) + } + + #[test] + #[should_panic] + fn test_parse_arg_format_error() { + parse_args(vec!["--input-format", "excel"]).unwrap(); + } + + #[test] + fn test_parse_arg_compression_format() { + let args = parse_args(vec!["--parquet-compression", "uncompressed"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::UNCOMPRESSED); + let args = parse_args(vec!["--parquet-compression", "snappy"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::SNAPPY); + let args = parse_args(vec!["--parquet-compression", "gzip"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::GZIP); + let args = parse_args(vec!["--parquet-compression", "lzo"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::LZO); + let args = parse_args(vec!["--parquet-compression", "lz4"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::LZ4); + let args = parse_args(vec!["--parquet-compression", "brotli"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::BROTLI); + let args = parse_args(vec!["--parquet-compression", "zstd"]).unwrap(); + assert_eq!(args.parquet_compression, Compression::ZSTD); + } + + #[test] + fn test_parse_arg_compression_format_fail() { + match parse_args(vec!["--parquet-compression", "zip"]) { + Ok(_) => panic!("unexpected success"), + Err(e) => assert_eq!( + format!("{}", e), + "error: Invalid value \"zip\" for '--parquet-compression ': Unknown compression ZIP : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, 
LZ4, ZSTD \n\nFor more information try --help\n"), + } + } + + fn assert_debug_text(debug_text: &str, name: &str, value: &str) { + let pattern = format!(" {}: {}", name, value); + assert!( + debug_text.contains(&pattern), + "\"{}\" not contains \"{}\"", + debug_text, + pattern + ) + } + + #[test] + fn test_configure_reader_builder() { + let args = Args { + schema: PathBuf::from(Path::new("schema.arvo")), + input_file: PathBuf::from(Path::new("test.csv")), + output_file: PathBuf::from(Path::new("out.parquet")), + batch_size: 1000, + input_format: CsvDialect::Csv, + has_header: false, + delimiter: None, + record_terminator: None, + escape_char: None, + quote_char: None, + double_quote: None, + parquet_compression: Compression::SNAPPY, + writer_version: None, + max_row_group_size: None, + }; + let arrow_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Utf8, false), + Field::new("field2", DataType::Utf8, false), + Field::new("field3", DataType::Utf8, false), + Field::new("field4", DataType::Utf8, false), + Field::new("field5", DataType::Utf8, false), + ])); + + let reader_builder = configure_reader_builder(&args, arrow_schema.clone()); + let builder_debug = format!("{:?}", reader_builder); + assert_debug_text(&builder_debug, "has_header", "false"); + assert_debug_text(&builder_debug, "delimiter", "Some(44)"); + assert_debug_text(&builder_debug, "quote", "Some(34)"); + assert_debug_text(&builder_debug, "terminator", "None"); + assert_debug_text(&builder_debug, "batch_size", "1000"); + assert_debug_text(&builder_debug, "escape", "None"); + + let args = Args { + schema: PathBuf::from(Path::new("schema.arvo")), + input_file: PathBuf::from(Path::new("test.csv")), + output_file: PathBuf::from(Path::new("out.parquet")), + batch_size: 2000, + input_format: CsvDialect::Tsv, + has_header: true, + delimiter: None, + record_terminator: None, + escape_char: Some('\\'), + quote_char: None, + double_quote: None, + parquet_compression: Compression::SNAPPY, + 
writer_version: None, + max_row_group_size: None, + }; + let arrow_schema = Arc::new(Schema::new(vec![ + Field::new("field1", DataType::Utf8, false), + Field::new("field2", DataType::Utf8, false), + Field::new("field3", DataType::Utf8, false), + Field::new("field4", DataType::Utf8, false), + Field::new("field5", DataType::Utf8, false), + ])); + let reader_builder = configure_reader_builder(&args, arrow_schema.clone()); + let builder_debug = format!("{:?}", reader_builder); + assert_debug_text(&builder_debug, "has_header", "true"); + assert_debug_text(&builder_debug, "delimiter", "Some(9)"); + assert_debug_text(&builder_debug, "quote", "None"); + assert_debug_text(&builder_debug, "terminator", "Some(10)"); + assert_debug_text(&builder_debug, "batch_size", "2000"); + assert_debug_text(&builder_debug, "escape", "Some(92)"); + } + + #[test] + fn test_convert_csv_to_parquet() { + let schema = NamedTempFile::new().unwrap(); + let schema_text = r"message schema { + optional int32 id; + optional binary name (STRING); + }"; + schema.as_file().write_all(schema_text.as_bytes()).unwrap(); + + let mut input_file = NamedTempFile::new().unwrap(); + { + let csv = input_file.as_file_mut(); + for index in 1..2000 { + write!(csv, "{},\"name_{}\"\r\n", index, index).unwrap(); + } + csv.flush().unwrap(); + csv.seek(SeekFrom::Start(0)).unwrap(); + } + let output_parquet = NamedTempFile::new().unwrap(); + + let args = Args { + schema: PathBuf::from(schema.path()), + input_file: PathBuf::from(input_file.path()), + output_file: PathBuf::from(output_parquet.path()), + batch_size: 1000, + input_format: CsvDialect::Csv, + has_header: false, + delimiter: None, + record_terminator: None, + escape_char: None, + quote_char: None, + double_quote: None, + parquet_compression: Compression::SNAPPY, + writer_version: None, + max_row_group_size: None, + }; + convert_csv_to_parquet(&args).unwrap(); + } +} diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 
8c3a31d2f356..9364bd30fffd 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -219,7 +219,7 @@ pub trait PageWriter { fn close(&mut self) -> Result<()>; } -/// An iterator over pages of some specific column in a parquet file. +/// An iterator over pages of one specific column in a parquet file. pub trait PageIterator: Iterator>> + Send { /// Get schema of parquet file. fn schema(&mut self) -> Result; diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index f4aecbf4e86f..a5e49360a28a 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -49,13 +49,14 @@ use crate::errors::{ParquetError, Result}; /// Parquet compression codec interface. pub trait Codec: Send { - /// Compresses data stored in slice `input_buf` and writes the compressed result + /// Compresses data stored in slice `input_buf` and appends the compressed result /// to `output_buf`. + /// /// Note that you'll need to call `clear()` before reusing the same `output_buf` /// across different `compress` calls. fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()>; - /// Decompresses data stored in slice `input_buf` and writes output to `output_buf`. + /// Decompresses data stored in slice `input_buf` and appends output to `output_buf`. /// Returns the total number of bytes written. 
fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result; @@ -111,9 +112,10 @@ mod snappy_codec { output_buf: &mut Vec, ) -> Result { let len = decompress_len(input_buf)?; - output_buf.resize(len, 0); + let offset = output_buf.len(); + output_buf.resize(offset + len, 0); self.decoder - .decompress(input_buf, output_buf) + .decompress(input_buf, &mut output_buf[offset..]) .map_err(|e| e.into()) } @@ -340,13 +342,13 @@ mod tests { .expect("Error when compressing"); // Decompress with c2 - let mut decompressed_size = c2 + let decompressed_size = c2 .decompress(compressed.as_slice(), &mut decompressed) .expect("Error when decompressing"); assert_eq!(data.len(), decompressed_size); - decompressed.truncate(decompressed_size); assert_eq!(data, decompressed.as_slice()); + decompressed.clear(); compressed.clear(); // Compress with c2 @@ -354,12 +356,32 @@ mod tests { .expect("Error when compressing"); // Decompress with c1 - decompressed_size = c1 + let decompressed_size = c1 .decompress(compressed.as_slice(), &mut decompressed) .expect("Error when decompressing"); assert_eq!(data.len(), decompressed_size); - decompressed.truncate(decompressed_size); assert_eq!(data, decompressed.as_slice()); + + decompressed.clear(); + compressed.clear(); + + // Test does not trample existing data in output buffers + let prefix = &[0xDE, 0xAD, 0xBE, 0xEF]; + decompressed.extend_from_slice(prefix); + compressed.extend_from_slice(prefix); + + c2.compress(data, &mut compressed) + .expect("Error when compressing"); + + assert_eq!(&compressed[..4], prefix); + + let decompressed_size = c2 + .decompress(&compressed[4..], &mut decompressed) + .expect("Error when decompressing"); + + assert_eq!(data.len(), decompressed_size); + assert_eq!(data, &decompressed[4..]); + assert_eq!(&decompressed[..4], prefix); } fn test_codec(c: CodecType) { diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index c01fb153089d..86ccefbd85eb 100644 --- a/parquet/src/data_type.rs +++ 
b/parquet/src/data_type.rs @@ -30,7 +30,7 @@ use crate::column::reader::{ColumnReader, ColumnReaderImpl}; use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; use crate::errors::{ParquetError, Result}; use crate::util::{ - bit_util::{from_ne_slice, FromBytes}, + bit_util::{from_le_slice, from_ne_slice, FromBytes}, memory::ByteBufferPtr, }; @@ -1194,8 +1194,14 @@ make_type!( impl FromBytes for Int96 { type Buffer = [u8; 12]; - fn from_le_bytes(_bs: Self::Buffer) -> Self { - unimplemented!() + fn from_le_bytes(bs: Self::Buffer) -> Self { + let mut i = Int96::new(); + i.set_data( + from_le_slice(&bs[0..4]), + from_le_slice(&bs[4..8]), + from_le_slice(&bs[8..12]), + ); + i } fn from_be_bytes(_bs: Self::Buffer) -> Self { unimplemented!() @@ -1215,8 +1221,8 @@ impl FromBytes for Int96 { // appear to actual be converted directly from bytes impl FromBytes for ByteArray { type Buffer = [u8; 8]; - fn from_le_bytes(_bs: Self::Buffer) -> Self { - unreachable!() + fn from_le_bytes(bs: Self::Buffer) -> Self { + ByteArray::from(bs.to_vec()) } fn from_be_bytes(_bs: Self::Buffer) -> Self { unreachable!() @@ -1229,8 +1235,8 @@ impl FromBytes for ByteArray { impl FromBytes for FixedLenByteArray { type Buffer = [u8; 8]; - fn from_le_bytes(_bs: Self::Buffer) -> Self { - unreachable!() + fn from_le_bytes(bs: Self::Buffer) -> Self { + Self(ByteArray::from(bs.to_vec())) } fn from_be_bytes(_bs: Self::Buffer) -> Self { unreachable!() diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index fcbb846f110f..c2fb5bd66cf9 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -17,7 +17,7 @@ //! Common Parquet errors and macros. 
-use std::{cell, convert, io, result, str}; +use std::{cell, io, result, str}; #[cfg(any(feature = "arrow", test))] use arrow::error::ArrowError; @@ -108,7 +108,7 @@ pub type Result = result::Result; // ---------------------------------------------------------------------- // Conversion from `ParquetError` to other types of `Error`s -impl convert::From for io::Error { +impl From for io::Error { fn from(e: ParquetError) -> Self { io::Error::new(io::ErrorKind::Other, e) } @@ -135,6 +135,7 @@ macro_rules! eof_err { ($fmt:expr, $($args:expr),*) => (ParquetError::EOF(format!($fmt, $($args),*))); } +#[cfg(any(feature = "arrow", test))] macro_rules! arrow_err { ($fmt:expr) => (ParquetError::ArrowError($fmt.to_owned())); ($fmt:expr, $($args:expr),*) => (ParquetError::ArrowError(format!($fmt, $($args),*))); diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index db8a23d8ebca..dc1d66d0fa44 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -15,11 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{ - cmp::min, - io::{Cursor, Read, Seek, SeekFrom}, - sync::Arc, -}; +use std::{io::Read, sync::Arc}; use byteorder::{ByteOrder, LittleEndian}; use parquet_format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; @@ -28,10 +24,7 @@ use thrift::protocol::TCompactInputProtocol; use crate::basic::ColumnOrder; use crate::errors::{ParquetError, Result}; -use crate::file::{ - metadata::*, reader::ChunkReader, DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, - PARQUET_MAGIC, -}; +use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE, PARQUET_MAGIC}; use crate::schema::types::{self, SchemaDescriptor}; @@ -52,55 +45,42 @@ pub fn parse_metadata(chunk_reader: &R) -> Result file_size as usize { return Err(general_err!( - "Invalid Parquet file. Metadata length is less than zero ({})", - metadata_len + "Invalid Parquet file. 
Reported metadata length of {} + {} byte footer, but file is only {} bytes", + metadata_len, + FOOTER_SIZE, + file_size )); } - let footer_metadata_len = FOOTER_SIZE + metadata_len as usize; - // build up the reader covering the entire metadata - let mut default_end_cursor = Cursor::new(default_len_end_buf); - if footer_metadata_len > file_size as usize { - return Err(general_err!( - "Invalid Parquet file. Metadata start is less than zero ({})", - file_size as i64 - footer_metadata_len as i64 + let mut metadata = Vec::with_capacity(metadata_len); + + let read = chunk_reader + .get_read(file_size - footer_metadata_len as u64, metadata_len)? + .read_to_end(&mut metadata)?; + + if read != metadata_len { + return Err(eof_err!( + "Expected to read {} bytes of metadata, got {}", + metadata_len, + read )); - } else if footer_metadata_len < DEFAULT_FOOTER_READ_SIZE { - // the whole metadata is in the bytes we already read - default_end_cursor.seek(SeekFrom::End(-(footer_metadata_len as i64)))?; - parse_metadata_buffer(&mut default_end_cursor) - } else { - // the end of file read by default is not long enough, read missing bytes - let complementary_end_read = chunk_reader.get_read( - file_size - footer_metadata_len as u64, - FOOTER_SIZE + metadata_len as usize - default_end_len, - )?; - parse_metadata_buffer(&mut complementary_end_read.chain(default_end_cursor)) } + + decode_metadata(&metadata) } -/// Reads [`ParquetMetaData`] from the provided [`Read`] starting at the readers current position -pub(crate) fn parse_metadata_buffer( - metadata_read: &mut T, -) -> Result { +/// Decodes [`ParquetMetaData`] from the provided bytes +pub fn decode_metadata(metadata_read: &[u8]) -> Result { // TODO: row group filtering let mut prot = TCompactInputProtocol::new(metadata_read); let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) @@ -124,6 +104,23 @@ pub(crate) fn parse_metadata_buffer( Ok(ParquetMetaData::new(file_metadata, row_groups)) } +/// Decodes 
the footer returning the metadata length in bytes +pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { + // check this is indeed a parquet file + if slice[4..] != PARQUET_MAGIC { + return Err(general_err!("Invalid Parquet file. Corrupt footer")); + } + + // get the metadata length from the footer + let metadata_len = LittleEndian::read_i32(&slice[..4]); + metadata_len.try_into().map_err(|_| { + general_err!( + "Invalid Parquet file. Metadata length is less than zero ({})", + metadata_len + ) + }) +} + /// Parses column orders from Thrift definition. /// If no column orders are defined, returns `None`. fn parse_column_orders( @@ -160,11 +157,11 @@ fn parse_column_orders( #[cfg(test)] mod tests { use super::*; + use bytes::Bytes; use crate::basic::SortOrder; use crate::basic::Type; use crate::schema::types::Type as SchemaType; - use crate::util::cursor::SliceableCursor; use parquet_format::TypeDefinedOrder; #[test] @@ -180,7 +177,7 @@ mod tests { #[test] fn test_parse_metadata_corrupt_footer() { - let data = SliceableCursor::new(Arc::new(vec![1, 2, 3, 4, 5, 6, 7, 8])); + let data = Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]); let reader_result = parse_metadata(&data); assert!(reader_result.is_err()); assert_eq!( @@ -191,8 +188,7 @@ mod tests { #[test] fn test_parse_metadata_invalid_length() { - let test_file = - SliceableCursor::new(Arc::new(vec![0, 0, 0, 255, b'P', b'A', b'R', b'1'])); + let test_file = Bytes::from(vec![0, 0, 0, 255, b'P', b'A', b'R', b'1']); let reader_result = parse_metadata(&test_file); assert!(reader_result.is_err()); assert_eq!( @@ -205,13 +201,14 @@ mod tests { #[test] fn test_parse_metadata_invalid_start() { - let test_file = - SliceableCursor::new(Arc::new(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1'])); + let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); let reader_result = parse_metadata(&test_file); assert!(reader_result.is_err()); assert_eq!( reader_result.err().unwrap(), - general_err!("Invalid Parquet file. 
Metadata start is less than zero (-255)") + general_err!( + "Invalid Parquet file. Reported metadata length of 255 + 8 byte footer, but file is only 8 bytes" + ) ); } diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 1d35d196322f..a3477dd75779 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -35,11 +35,12 @@ use std::sync::Arc; -use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; +use parquet_format::{ColumnChunk, ColumnMetaData, PageLocation, RowGroup}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::errors::{ParquetError, Result}; use crate::file::page_encoding_stats::{self, PageEncodingStats}; +use crate::file::page_index::index::Index; use crate::file::statistics::{self, Statistics}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -51,6 +52,8 @@ use crate::schema::types::{ pub struct ParquetMetaData { file_metadata: FileMetaData, row_groups: Vec, + page_indexes: Option>, + offset_indexes: Option>>, } impl ParquetMetaData { @@ -60,6 +63,22 @@ impl ParquetMetaData { ParquetMetaData { file_metadata, row_groups, + page_indexes: None, + offset_indexes: None, + } + } + + pub fn new_with_page_index( + file_metadata: FileMetaData, + row_groups: Vec, + page_indexes: Option>, + offset_indexes: Option>>, + ) -> Self { + ParquetMetaData { + file_metadata, + row_groups, + page_indexes, + offset_indexes, } } @@ -83,6 +102,16 @@ impl ParquetMetaData { pub fn row_groups(&self) -> &[RowGroupMetaData] { &self.row_groups } + + /// Returns page indexes in this file. + pub fn page_indexes(&self) -> Option<&Vec> { + self.page_indexes.as_ref() + } + + /// Returns offset indexes in this file. + pub fn offset_indexes(&self) -> Option<&Vec>> { + self.offset_indexes.as_ref() + } } pub type KeyValue = parquet_format::KeyValue; @@ -188,12 +217,13 @@ impl FileMetaData { pub type RowGroupMetaDataPtr = Arc; /// Metadata for a row group. 
-#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct RowGroupMetaData { columns: Vec, num_rows: i64, total_byte_size: i64, schema_descr: SchemaDescPtr, + // Todo add filter result -> row range } impl RowGroupMetaData { diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index d293dc7731ad..66d8ce48e0a7 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -98,14 +98,13 @@ pub mod footer; pub mod metadata; pub mod page_encoding_stats; +pub mod page_index; pub mod properties; pub mod reader; pub mod serialized_reader; pub mod statistics; pub mod writer; -const FOOTER_SIZE: usize = 8; -pub(crate) const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; - -/// The number of bytes read at the end of the parquet file on first read -const DEFAULT_FOOTER_READ_SIZE: usize = 64 * 1024; +/// The length of the parquet footer in bytes +pub const FOOTER_SIZE: usize = 8; +const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs new file mode 100644 index 000000000000..e97826c63b41 --- /dev/null +++ b/parquet/src/file/page_index/index.rs @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::basic::Type; +use crate::data_type::private::ParquetValueType; +use crate::data_type::Int96; +use crate::errors::ParquetError; +use crate::util::bit_util::from_le_slice; +use parquet_format::{BoundaryOrder, ColumnIndex}; +use std::fmt::Debug; + +/// The statistics in one page +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct PageIndex { + /// The minimum value, It is None when all values are null + pub min: Option, + /// The maximum value, It is None when all values are null + pub max: Option, + /// Null values in the page + pub null_count: Option, +} + +impl PageIndex { + pub fn min(&self) -> Option<&T> { + self.min.as_ref() + } + pub fn max(&self) -> Option<&T> { + self.max.as_ref() + } + pub fn null_count(&self) -> Option { + self.null_count + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum Index { + BOOLEAN(BooleanIndex), + INT32(NativeIndex), + INT64(NativeIndex), + INT96(NativeIndex), + FLOAT(NativeIndex), + DOUBLE(NativeIndex), + BYTE_ARRAY(ByteArrayIndex), + FIXED_LEN_BYTE_ARRAY(ByteArrayIndex), +} + +/// An index of a column of [`Type`] physical representation +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct NativeIndex { + /// The physical type + pub physical_type: Type, + /// The indexes, one item per page + pub indexes: Vec>, + /// the order + pub boundary_order: BoundaryOrder, +} + +impl NativeIndex { + /// Creates a new [`NativeIndex`] + pub(crate) fn try_new( + index: ColumnIndex, + physical_type: Type, + ) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null 
{ + (None, None) + } else { + let min = min.as_slice(); + let max = max.as_slice(); + (Some(from_le_slice::(min)), Some(from_le_slice::(max))) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, ParquetError>>()?; + + Ok(Self { + physical_type, + indexes, + boundary_order: index.boundary_order, + }) + } +} + +/// An index of a column of bytes type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ByteArrayIndex { + /// The physical type + pub physical_type: Type, + /// The indexes, one item per page + pub indexes: Vec>>, + pub boundary_order: BoundaryOrder, +} + +impl ByteArrayIndex { + pub(crate) fn try_new( + index: ColumnIndex, + physical_type: Type, + ) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .into_iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + (Some(min), Some(max)) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, ParquetError>>()?; + + Ok(Self { + physical_type, + indexes, + boundary_order: index.boundary_order, + }) + } +} + +/// An index of a column of boolean physical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct BooleanIndex { + /// The indexes, one item per page + pub indexes: Vec>, + pub boundary_order: BoundaryOrder, +} + +impl BooleanIndex { + pub(crate) fn try_new(index: ColumnIndex) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .into_iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, 
max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min[0] != 0; + let max = max[0] == 1; + (Some(min), Some(max)) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, ParquetError>>()?; + + Ok(Self { + indexes, + boundary_order: index.boundary_order, + }) + } +} diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs new file mode 100644 index 000000000000..8414480903fd --- /dev/null +++ b/parquet/src/file/page_index/index_reader.rs @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::basic::Type; +use crate::data_type::Int96; +use crate::errors::ParquetError; +use crate::file::metadata::ColumnChunkMetaData; +use crate::file::page_index::index::{BooleanIndex, ByteArrayIndex, Index, NativeIndex}; +use crate::file::reader::ChunkReader; +use parquet_format::{ColumnIndex, OffsetIndex, PageLocation}; +use std::io::{Cursor, Read}; +use thrift::protocol::TCompactInputProtocol; + +/// Read on row group's all columns indexes and change into [`Index`] +/// If not the format not available return an empty vector. 
+pub fn read_columns_indexes( + reader: &R, + chunks: &[ColumnChunkMetaData], +) -> Result, ParquetError> { + let (offset, lengths) = get_index_offset_and_lengths(chunks)?; + let length = lengths.iter().sum::(); + + //read all need data into buffer + let mut reader = reader.get_read(offset, reader.len() as usize)?; + let mut data = vec![0; length]; + reader.read_exact(&mut data)?; + + let mut start = 0; + let data = lengths.into_iter().map(|length| { + let r = &data[start..start + length]; + start += length; + r + }); + + chunks + .iter() + .zip(data) + .map(|(chunk, data)| { + let column_type = chunk.column_type(); + deserialize_column_index(data, column_type) + }) + .collect() +} + +/// Read on row group's all indexes and change into [`Index`] +/// If not the format not available return an empty vector. +pub fn read_pages_locations( + reader: &R, + chunks: &[ColumnChunkMetaData], +) -> Result>, ParquetError> { + let (offset, total_length) = get_location_offset_and_total_length(chunks)?; + + //read all need data into buffer + let mut reader = reader.get_read(offset, reader.len() as usize)?; + let mut data = vec![0; total_length]; + reader.read_exact(&mut data)?; + + let mut d = Cursor::new(data); + let mut result = vec![]; + + for _ in 0..chunks.len() { + let mut prot = TCompactInputProtocol::new(&mut d); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + result.push(offset.page_locations); + } + Ok(result) +} + +//Get File offsets of every ColumnChunk's page_index +//If there are invalid offset return a zero offset with empty lengths. 
+fn get_index_offset_and_lengths( + chunks: &[ColumnChunkMetaData], +) -> Result<(u64, Vec), ParquetError> { + let first_col_metadata = if let Some(chunk) = chunks.first() { + chunk + } else { + return Ok((0, vec![])); + }; + + let offset: u64 = if let Some(offset) = first_col_metadata.column_index_offset() { + offset.try_into().unwrap() + } else { + return Ok((0, vec![])); + }; + + let lengths = chunks + .iter() + .map(|x| x.column_index_length()) + .map(|maybe_length| { + let index_length = maybe_length.ok_or_else(|| { + ParquetError::General( + "The column_index_length must exist if offset_index_offset exists" + .to_string(), + ) + })?; + + Ok(index_length.try_into().unwrap()) + }) + .collect::, ParquetError>>()?; + + Ok((offset, lengths)) +} + +//Get File offset of ColumnChunk's pages_locations +//If there are invalid offset return a zero offset with zero length. +fn get_location_offset_and_total_length( + chunks: &[ColumnChunkMetaData], +) -> Result<(u64, usize), ParquetError> { + let metadata = if let Some(chunk) = chunks.first() { + chunk + } else { + return Ok((0, 0)); + }; + + let offset: u64 = if let Some(offset) = metadata.offset_index_offset() { + offset.try_into().unwrap() + } else { + return Ok((0, 0)); + }; + + let total_length = chunks + .iter() + .map(|x| x.offset_index_length().unwrap()) + .sum::() as usize; + Ok((offset, total_length)) +} + +fn deserialize_column_index( + data: &[u8], + column_type: Type, +) -> Result { + let mut d = Cursor::new(data); + let mut prot = TCompactInputProtocol::new(&mut d); + + let index = ColumnIndex::read_from_in_protocol(&mut prot)?; + + let index = match column_type { + Type::BOOLEAN => Index::BOOLEAN(BooleanIndex::try_new(index)?), + Type::INT32 => Index::INT32(NativeIndex::::try_new(index, column_type)?), + Type::INT64 => Index::INT64(NativeIndex::::try_new(index, column_type)?), + Type::INT96 => Index::INT96(NativeIndex::::try_new(index, column_type)?), + Type::FLOAT => 
Index::FLOAT(NativeIndex::::try_new(index, column_type)?), + Type::DOUBLE => Index::DOUBLE(NativeIndex::::try_new(index, column_type)?), + Type::BYTE_ARRAY => { + Index::BYTE_ARRAY(ByteArrayIndex::try_new(index, column_type)?) + } + Type::FIXED_LEN_BYTE_ARRAY => { + Index::FIXED_LEN_BYTE_ARRAY(ByteArrayIndex::try_new(index, column_type)?) + } + }; + + Ok(index) +} diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs new file mode 100644 index 000000000000..fc87ef20448f --- /dev/null +++ b/parquet/src/file/page_index/mod.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod index; +pub mod index_reader; +pub(crate) mod range; diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs new file mode 100644 index 000000000000..06c06553ccd5 --- /dev/null +++ b/parquet/src/file/page_index/range.rs @@ -0,0 +1,474 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use crate::errors::ParquetError; +use parquet_format::PageLocation; +use std::cmp::Ordering; +use std::collections::VecDeque; +use std::ops::RangeInclusive; + +type Range = RangeInclusive; + +pub trait RangeOps { + fn is_before(&self, other: &Self) -> bool; + + fn is_after(&self, other: &Self) -> bool; + + fn count(&self) -> usize; + + fn union(left: &Range, right: &Range) -> Option; + + fn intersection(left: &Range, right: &Range) -> Option; +} + +impl RangeOps for Range { + fn is_before(&self, other: &Range) -> bool { + self.end() < other.start() + } + + fn is_after(&self, other: &Range) -> bool { + self.start() > other.end() + } + + fn count(&self) -> usize { + self.end() + 1 - self.start() + } + + /// Return the union of the two ranges, + /// Return `None` if there are hole between them. + fn union(left: &Range, right: &Range) -> Option { + if left.start() <= right.start() { + if left.end() + 1 >= *right.start() { + return Some(Range::new( + *left.start(), + std::cmp::max(*left.end(), *right.end()), + )); + } + } else if right.end() + 1 >= *left.start() { + return Some(Range::new( + *right.start(), + std::cmp::max(*left.end(), *right.end()), + )); + } + None + } + + /// Returns the intersection of the two ranges, + /// return null if they are not overlapped. 
+ fn intersection(left: &Range, right: &Range) -> Option { + if left.start() <= right.start() { + if left.end() >= right.start() { + return Some(Range::new( + *right.start(), + std::cmp::min(*left.end(), *right.end()), + )); + } + } else if right.end() >= left.start() { + return Some(Range::new( + *left.start(), + std::cmp::min(*left.end(), *right.end()), + )); + } + None + } +} + +/// Struct representing row ranges in a row-group. These row ranges are calculated as a result of using +/// the column index on the filtering. +#[derive(Debug, Clone)] +pub struct RowRanges { + pub ranges: VecDeque, +} + +impl RowRanges { + //create an empty RowRanges + pub fn new_empty() -> Self { + RowRanges { + ranges: VecDeque::new(), + } + } + + pub fn count(&self) -> usize { + self.ranges.len() + } + + pub fn filter_with_mask(&self, mask: &[bool]) -> Result { + if self.ranges.len() != mask.len() { + return Err(ParquetError::General(format!( + "Mask size {} is not equal to number of pages {}", + mask.len(), + self.count() + ))); + } + let vec_range = mask + .iter() + .zip(self.ranges.clone()) + .filter_map(|(&f, r)| if f { Some(r) } else { None }) + .collect(); + Ok(RowRanges { ranges: vec_range }) + } + + /// Add a range to the end of the list of ranges. It maintains the disjunctive ascending order of the ranges by + /// trying to union the specified range to the last ranges in the list. The specified range shall be larger than + /// the last one or might be overlapped with some of the last ones. 
+ /// [a, b] < [c, d] if b < c + pub fn add(&mut self, mut range: Range) { + let count = self.count(); + if count > 0 { + for i in 1..(count + 1) { + let index = count - i; + let last = self.ranges.get(index).unwrap(); + assert!(!last.is_after(&range), "Must add range in ascending!"); + // try to merge range + match Range::union(last, &range) { + None => { + break; + } + Some(r) => { + range = r; + self.ranges.remove(index); + } + } + } + } + self.ranges.push_back(range); + } + + /// Calculates the union of the two specified RowRanges object. The union of two range is calculated if there are no + /// elements between them. Otherwise, the two disjunctive ranges are stored separately. + /// For example: + /// [113, 241] ∪ [221, 340] = [113, 330] + /// [113, 230] ∪ [231, 340] = [113, 340] + /// while + /// [113, 230] ∪ [232, 340] = [113, 230], [232, 340] + /// + /// The result RowRanges object will contain all the row indexes that were contained in one of the specified objects. + pub fn union(mut left: RowRanges, mut right: RowRanges) -> RowRanges { + let v1 = &mut left.ranges; + let v2 = &mut right.ranges; + let mut result = RowRanges::new_empty(); + if v2.is_empty() { + left.clone() + } else { + let mut range2 = v2.pop_front().unwrap(); + while !v1.is_empty() { + let range1 = v1.pop_front().unwrap(); + if range1.is_after(&range2) { + result.add(range2); + range2 = range1; + std::mem::swap(v1, v2); + } else { + result.add(range1); + } + } + + result.add(range2); + while !v2.is_empty() { + result.add(v2.pop_front().unwrap()) + } + + result + } + } + + /// Calculates the intersection of the two specified RowRanges object. Two ranges intersect if they have common + /// elements otherwise the result is empty. 
+ /// For example: + /// [113, 241] ∩ [221, 340] = [221, 241] + /// while + /// [113, 230] ∩ [231, 340] = + /// + /// The result RowRanges object will contain all the row indexes there were contained in both of the specified objects + #[allow(clippy::mut_range_bound)] + pub fn intersection(left: RowRanges, right: RowRanges) -> RowRanges { + let mut result = RowRanges::new_empty(); + let mut right_index = 0; + for l in left.ranges.iter() { + for i in right_index..right.ranges.len() { + let r = right.ranges.get(i).unwrap(); + if l.is_before(r) { + break; + } else if l.is_after(r) { + right_index = i + 1; + continue; + } + if let Some(ra) = Range::intersection(l, r) { + result.add(ra); + } + } + } + result + } + + pub fn row_count(&self) -> usize { + self.ranges.iter().map(|x| x.count()).sum() + } + + pub fn is_overlapping(&self, x: &Range) -> bool { + self.ranges + .binary_search_by(|y| -> Ordering { + if y.is_before(x) { + Ordering::Less + } else if y.is_after(x) { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .is_ok() + } +} + +/// Takes an array of [`PageLocation`], and a total number of rows, and based on the provided `page_mask` +/// returns the corresponding [`RowRanges`] to scan +pub fn compute_row_ranges( + page_mask: &[bool], + locations: &[PageLocation], + total_rows: usize, +) -> Result { + if page_mask.len() != locations.len() { + return Err(ParquetError::General(format!( + "Page_mask size {} is not equal to number of locations {}", + page_mask.len(), + locations.len(), + ))); + } + let row_ranges = page_locations_to_row_ranges(locations, total_rows)?; + row_ranges.filter_with_mask(page_mask) +} + +fn page_locations_to_row_ranges( + locations: &[PageLocation], + total_rows: usize, +) -> Result { + if locations.is_empty() || total_rows == 0 { + return Ok(RowRanges::new_empty()); + } + + // If we read directly from parquet pageIndex to construct locations, + // the location index should be continuous + let mut vec_range: VecDeque = locations + 
.windows(2) + .map(|x| { + let start = x[0].first_row_index as usize; + let end = (x[1].first_row_index - 1) as usize; + Range::new(start, end) + }) + .collect(); + + let last = Range::new( + locations.last().unwrap().first_row_index as usize, + total_rows - 1, + ); + vec_range.push_back(last); + + Ok(RowRanges { ranges: vec_range }) +} + +#[cfg(test)] +mod tests { + use crate::basic::Type::INT32; + use crate::file::page_index::index::{NativeIndex, PageIndex}; + use crate::file::page_index::range::{compute_row_ranges, Range, RowRanges}; + use parquet_format::{BoundaryOrder, PageLocation}; + + #[test] + fn test_binary_search_overlap() { + let mut ranges = RowRanges::new_empty(); + ranges.add(Range::new(1, 3)); + ranges.add(Range::new(6, 7)); + + assert!(ranges.is_overlapping(&Range::new(1, 2))); + // include both [start, end] + assert!(ranges.is_overlapping(&Range::new(0, 1))); + assert!(ranges.is_overlapping(&Range::new(0, 3))); + + assert!(ranges.is_overlapping(&Range::new(0, 7))); + assert!(ranges.is_overlapping(&Range::new(2, 7))); + + assert!(!ranges.is_overlapping(&Range::new(4, 5))); + } + + #[test] + fn test_add_func_ascending_disjunctive() { + let mut ranges_1 = RowRanges::new_empty(); + ranges_1.add(Range::new(1, 3)); + ranges_1.add(Range::new(5, 6)); + ranges_1.add(Range::new(8, 9)); + assert_eq!(ranges_1.count(), 3); + } + + #[test] + fn test_add_func_ascending_merge() { + let mut ranges_1 = RowRanges::new_empty(); + ranges_1.add(Range::new(1, 3)); + ranges_1.add(Range::new(4, 5)); + ranges_1.add(Range::new(6, 7)); + assert_eq!(ranges_1.count(), 1); + } + + #[test] + #[should_panic(expected = "Must add range in ascending!")] + fn test_add_func_not_ascending() { + let mut ranges_1 = RowRanges::new_empty(); + ranges_1.add(Range::new(6, 7)); + ranges_1.add(Range::new(1, 3)); + ranges_1.add(Range::new(4, 5)); + assert_eq!(ranges_1.count(), 1); + } + + #[test] + fn test_union_func() { + let mut ranges_1 = RowRanges::new_empty(); + ranges_1.add(Range::new(1, 
2)); + ranges_1.add(Range::new(3, 4)); + ranges_1.add(Range::new(5, 6)); + + let mut ranges_2 = RowRanges::new_empty(); + ranges_2.add(Range::new(2, 3)); + ranges_2.add(Range::new(4, 5)); + ranges_2.add(Range::new(6, 7)); + + let ranges = RowRanges::union(ranges_1, ranges_2); + assert_eq!(ranges.count(), 1); + let range = ranges.ranges.get(0).unwrap(); + assert_eq!(*range.start(), 1); + assert_eq!(*range.end(), 7); + + let mut ranges_a = RowRanges::new_empty(); + ranges_a.add(Range::new(1, 3)); + ranges_a.add(Range::new(5, 8)); + ranges_a.add(Range::new(11, 12)); + + let mut ranges_b = RowRanges::new_empty(); + ranges_b.add(Range::new(0, 2)); + ranges_b.add(Range::new(6, 7)); + ranges_b.add(Range::new(10, 11)); + + let ranges = RowRanges::union(ranges_a, ranges_b); + assert_eq!(ranges.count(), 3); + + let range_1 = ranges.ranges.get(0).unwrap(); + assert_eq!(*range_1.start(), 0); + assert_eq!(*range_1.end(), 3); + let range_2 = ranges.ranges.get(1).unwrap(); + assert_eq!(*range_2.start(), 5); + assert_eq!(*range_2.end(), 8); + let range_3 = ranges.ranges.get(2).unwrap(); + assert_eq!(*range_3.start(), 10); + assert_eq!(*range_3.end(), 12); + } + + #[test] + fn test_intersection_func() { + let mut ranges_1 = RowRanges::new_empty(); + ranges_1.add(Range::new(1, 2)); + ranges_1.add(Range::new(3, 4)); + ranges_1.add(Range::new(5, 6)); + + let mut ranges_2 = RowRanges::new_empty(); + ranges_2.add(Range::new(2, 3)); + ranges_2.add(Range::new(4, 5)); + ranges_2.add(Range::new(6, 7)); + + let ranges = RowRanges::intersection(ranges_1, ranges_2); + assert_eq!(ranges.count(), 1); + let range = ranges.ranges.get(0).unwrap(); + assert_eq!(*range.start(), 2); + assert_eq!(*range.end(), 6); + + let mut ranges_a = RowRanges::new_empty(); + ranges_a.add(Range::new(1, 3)); + ranges_a.add(Range::new(5, 8)); + ranges_a.add(Range::new(11, 12)); + + let mut ranges_b = RowRanges::new_empty(); + ranges_b.add(Range::new(0, 2)); + ranges_b.add(Range::new(6, 7)); + 
ranges_b.add(Range::new(10, 11)); + + let ranges = RowRanges::intersection(ranges_a, ranges_b); + assert_eq!(ranges.count(), 3); + + let range_1 = ranges.ranges.get(0).unwrap(); + assert_eq!(*range_1.start(), 1); + assert_eq!(*range_1.end(), 2); + let range_2 = ranges.ranges.get(1).unwrap(); + assert_eq!(*range_2.start(), 6); + assert_eq!(*range_2.end(), 7); + let range_3 = ranges.ranges.get(2).unwrap(); + assert_eq!(*range_3.start(), 11); + assert_eq!(*range_3.end(), 11); + } + + #[test] + fn test_compute_one() { + let locations = &[PageLocation { + offset: 50, + compressed_page_size: 10, + first_row_index: 0, + }]; + let total_rows = 10; + + let row_ranges = compute_row_ranges(&[true], locations, total_rows).unwrap(); + assert_eq!(row_ranges.count(), 1); + assert_eq!(row_ranges.ranges.get(0).unwrap(), &Range::new(0, 9)); + } + + #[test] + fn test_compute_multi() { + let index: NativeIndex = NativeIndex { + physical_type: INT32, + indexes: vec![ + PageIndex { + min: Some(0), + max: Some(10), + null_count: Some(0), + }, + PageIndex { + min: Some(15), + max: Some(20), + null_count: Some(0), + }, + ], + boundary_order: BoundaryOrder::Ascending, + }; + let locations = &[ + PageLocation { + offset: 100, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 200, + compressed_page_size: 20, + first_row_index: 11, + }, + ]; + let total_rows = 20; + + //filter `x < 11` + let filter = + |page: &PageIndex| page.max.as_ref().map(|&x| x < 11).unwrap_or(false); + + let mask = index.indexes.iter().map(filter).collect::>(); + + let row_ranges = compute_row_ranges(&mask, locations, total_rows).unwrap(); + + assert_eq!(row_ranges.count(), 1); + assert_eq!(row_ranges.ranges.get(0).unwrap(), &Range::new(0, 10)); + } +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 8059157aabf2..6ff73e041e88 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -18,6 +18,7 @@ //! 
Contains implementations of the reader traits FileReader, RowGroupReader and PageReader //! Also contains implementations of the ChunkReader for files (with buffering) and byte arrays (RAM) +use bytes::{Buf, Bytes}; use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; use parquet_format::{PageHeader, PageType}; @@ -27,6 +28,7 @@ use crate::basic::{Compression, Encoding, Type}; use crate::column::page::{Page, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; +use crate::file::page_index::index_reader; use crate::file::{footer, metadata::*, reader::*, statistics}; use crate::record::reader::RowIter; use crate::record::Row; @@ -35,6 +37,7 @@ use crate::util::{io::TryClone, memory::ByteBufferPtr}; // export `SliceableCursor` and `FileSource` publically so clients can // re-use the logic in their own ParquetFileWriter wrappers +#[allow(deprecated)] pub use crate::util::{cursor::SliceableCursor, io::FileSource}; // ---------------------------------------------------------------------- @@ -60,12 +63,35 @@ impl ChunkReader for File { } } +impl Length for Bytes { + fn len(&self) -> u64 { + self.len() as u64 + } +} + +impl TryClone for Bytes { + fn try_clone(&self) -> std::io::Result { + Ok(self.clone()) + } +} + +impl ChunkReader for Bytes { + type T = bytes::buf::Reader; + + fn get_read(&self, start: u64, length: usize) -> Result { + let start = start as usize; + Ok(self.slice(start..start + length).reader()) + } +} + +#[allow(deprecated)] impl Length for SliceableCursor { fn len(&self) -> u64 { SliceableCursor::len(self) } } +#[allow(deprecated)] impl ChunkReader for SliceableCursor { type T = SliceableCursor; @@ -132,12 +158,16 @@ pub struct SerializedFileReader { /// they will be chained using 'AND' to filter the row groups. 
pub struct ReadOptionsBuilder { predicates: Vec bool>>, + enable_page_index: bool, } impl ReadOptionsBuilder { /// New builder pub fn new() -> Self { - ReadOptionsBuilder { predicates: vec![] } + ReadOptionsBuilder { + predicates: vec![], + enable_page_index: false, + } } /// Add a predicate on row group metadata to the reading option, @@ -162,10 +192,17 @@ impl ReadOptionsBuilder { self } + /// Enable page index in the reading option, + pub fn with_page_index(mut self) -> Self { + self.enable_page_index = true; + self + } + /// Seal the builder and return the read options pub fn build(self) -> ReadOptions { ReadOptions { predicates: self.predicates, + enable_page_index: self.enable_page_index, } } } @@ -176,6 +213,7 @@ impl ReadOptionsBuilder { /// All predicates will be chained using 'AND' to filter the row groups. pub struct ReadOptions { predicates: Vec bool>>, + enable_page_index: bool, } impl SerializedFileReader { @@ -209,13 +247,33 @@ impl SerializedFileReader { } } - Ok(Self { - chunk_reader: Arc::new(chunk_reader), - metadata: ParquetMetaData::new( - metadata.file_metadata().clone(), - filtered_row_groups, - ), - }) + if options.enable_page_index { + //Todo for now test data `data_index_bloom_encoding_stats.parquet` only have one rowgroup + //support multi after create multi-RG test data. 
+ let cols = metadata.row_group(0); + let columns_indexes = + index_reader::read_columns_indexes(&chunk_reader, cols.columns())?; + let pages_locations = + index_reader::read_pages_locations(&chunk_reader, cols.columns())?; + + Ok(Self { + chunk_reader: Arc::new(chunk_reader), + metadata: ParquetMetaData::new_with_page_index( + metadata.file_metadata().clone(), + filtered_row_groups, + Some(columns_indexes), + Some(pages_locations), + ), + }) + } else { + Ok(Self { + chunk_reader: Arc::new(chunk_reader), + metadata: ParquetMetaData::new( + metadata.file_metadata().clone(), + filtered_row_groups, + ), + }) + } } } @@ -284,6 +342,7 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' fn get_column_page_reader(&self, i: usize) -> Result> { let col = self.metadata.column(i); let (col_start, col_length) = col.byte_range(); + //Todo filter with multi row range let file_chunk = self.chunk_reader.get_read(col_start, col_length as usize)?; let page_reader = SerializedPageReader::new( file_chunk, @@ -299,6 +358,108 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' } } +/// Reads a [`PageHeader`] from the provided [`Read`] +pub(crate) fn read_page_header(input: &mut T) -> Result { + let mut prot = TCompactInputProtocol::new(input); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) +} + +/// Decodes a [`Page`] from the provided `buffer` +pub(crate) fn decode_page( + page_header: PageHeader, + buffer: ByteBufferPtr, + physical_type: Type, + decompressor: Option<&mut Box>, +) -> Result { + // When processing data page v2, depending on enabled compression for the + // page, we should account for uncompressed data ('offset') of + // repetition and definition levels. 
+ // + // We always use 0 offset for other pages other than v2, `true` flag means + // that compression will be applied if decompressor is defined + let mut offset: usize = 0; + let mut can_decompress = true; + + if let Some(ref header_v2) = page_header.data_page_header_v2 { + offset = (header_v2.definition_levels_byte_length + + header_v2.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + can_decompress = header_v2.is_compressed.unwrap_or(true); + } + + // TODO: page header could be huge because of statistics. We should set a + // maximum page header size and abort if that is exceeded. + let buffer = match decompressor { + Some(decompressor) if can_decompress => { + let uncompressed_size = page_header.uncompressed_page_size as usize; + let mut decompressed = Vec::with_capacity(uncompressed_size); + let compressed = &buffer.as_ref()[offset..]; + decompressed.extend_from_slice(&buffer.as_ref()[..offset]); + decompressor.decompress(compressed, &mut decompressed)?; + + if decompressed.len() != uncompressed_size { + return Err(general_err!( + "Actual decompressed size doesn't match the expected one ({} vs {})", + decompressed.len(), + uncompressed_size + )); + } + + ByteBufferPtr::new(decompressed) + } + _ => buffer, + }; + + let result = match page_header.type_ { + PageType::DictionaryPage => { + assert!(page_header.dictionary_page_header.is_some()); + let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let is_sorted = dict_header.is_sorted.unwrap_or(false); + Page::DictionaryPage { + buf: buffer, + num_values: dict_header.num_values as u32, + encoding: Encoding::from(dict_header.encoding), + is_sorted, + } + } + PageType::DataPage => { + assert!(page_header.data_page_header.is_some()); + let header = page_header.data_page_header.unwrap(); + Page::DataPage { + buf: buffer, + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + def_level_encoding: 
Encoding::from(header.definition_level_encoding), + rep_level_encoding: Encoding::from(header.repetition_level_encoding), + statistics: statistics::from_thrift(physical_type, header.statistics), + } + } + PageType::DataPageV2 => { + assert!(page_header.data_page_header_v2.is_some()); + let header = page_header.data_page_header_v2.unwrap(); + let is_compressed = header.is_compressed.unwrap_or(true); + Page::DataPageV2 { + buf: buffer, + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + num_nulls: header.num_nulls as u32, + num_rows: header.num_rows as u32, + def_levels_byte_len: header.definition_levels_byte_length as u32, + rep_levels_byte_len: header.repetition_levels_byte_length as u32, + is_compressed, + statistics: statistics::from_thrift(physical_type, header.statistics), + } + } + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + unimplemented!("Page type {:?} is not supported", page_header.type_) + } + }; + + Ok(result) +} + /// A serialized implementation for Parquet [`PageReader`]. pub struct SerializedPageReader { // The file source buffer which references exactly the bytes for the column trunk @@ -336,13 +497,6 @@ impl SerializedPageReader { }; Ok(result) } - - /// Reads Page header from Thrift. - fn read_page_header(&mut self) -> Result { - let mut prot = TCompactInputProtocol::new(&mut self.buf); - let page_header = PageHeader::read_from_in_protocol(&mut prot)?; - Ok(page_header) - } } impl Iterator for SerializedPageReader { @@ -356,108 +510,40 @@ impl Iterator for SerializedPageReader { impl PageReader for SerializedPageReader { fn get_next_page(&mut self) -> Result> { while self.seen_num_values < self.total_num_values { - let page_header = self.read_page_header()?; - - // When processing data page v2, depending on enabled compression for the - // page, we should account for uncompressed data ('offset') of - // repetition and definition levels. 
- // - // We always use 0 offset for other pages other than v2, `true` flag means - // that compression will be applied if decompressor is defined - let mut offset: usize = 0; - let mut can_decompress = true; - - if let Some(ref header_v2) = page_header.data_page_header_v2 { - offset = (header_v2.definition_levels_byte_length - + header_v2.repetition_levels_byte_length) - as usize; - // When is_compressed flag is missing the page is considered compressed - can_decompress = header_v2.is_compressed.unwrap_or(true); - } - - let compressed_len = page_header.compressed_page_size as usize - offset; - let uncompressed_len = page_header.uncompressed_page_size as usize - offset; - // We still need to read all bytes from buffered stream - let mut buffer = vec![0; offset + compressed_len]; - self.buf.read_exact(&mut buffer)?; - - // TODO: page header could be huge because of statistics. We should set a - // maximum page header size and abort if that is exceeded. - if let Some(decompressor) = self.decompressor.as_mut() { - if can_decompress { - let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); - let decompressed_size = decompressor - .decompress(&buffer[offset..], &mut decompressed_buffer)?; - if decompressed_size != uncompressed_len { - return Err(general_err!( - "Actual decompressed size doesn't match the expected one ({} vs {})", - decompressed_size, - uncompressed_len - )); - } - if offset == 0 { - buffer = decompressed_buffer; - } else { - // Prepend saved offsets to the buffer - buffer.truncate(offset); - buffer.append(&mut decompressed_buffer); - } - } + let page_header = read_page_header(&mut self.buf)?; + + let to_read = page_header.compressed_page_size as usize; + let mut buffer = Vec::with_capacity(to_read); + let read = (&mut self.buf) + .take(to_read as u64) + .read_to_end(&mut buffer)?; + + if read != to_read { + return Err(eof_err!( + "Expected to read {} bytes of page, read only {}", + to_read, + read + )); } + let buffer = 
ByteBufferPtr::new(buffer); let result = match page_header.type_ { - PageType::DictionaryPage => { - assert!(page_header.dictionary_page_header.is_some()); - let dict_header = - page_header.dictionary_page_header.as_ref().unwrap(); - let is_sorted = dict_header.is_sorted.unwrap_or(false); - Page::DictionaryPage { - buf: ByteBufferPtr::new(buffer), - num_values: dict_header.num_values as u32, - encoding: Encoding::from(dict_header.encoding), - is_sorted, - } - } - PageType::DataPage => { - assert!(page_header.data_page_header.is_some()); - let header = page_header.data_page_header.unwrap(); - self.seen_num_values += header.num_values as i64; - Page::DataPage { - buf: ByteBufferPtr::new(buffer), - num_values: header.num_values as u32, - encoding: Encoding::from(header.encoding), - def_level_encoding: Encoding::from( - header.definition_level_encoding, - ), - rep_level_encoding: Encoding::from( - header.repetition_level_encoding, - ), - statistics: statistics::from_thrift( - self.physical_type, - header.statistics, - ), - } - } - PageType::DataPageV2 => { - assert!(page_header.data_page_header_v2.is_some()); - let header = page_header.data_page_header_v2.unwrap(); - let is_compressed = header.is_compressed.unwrap_or(true); - self.seen_num_values += header.num_values as i64; - Page::DataPageV2 { - buf: ByteBufferPtr::new(buffer), - num_values: header.num_values as u32, - encoding: Encoding::from(header.encoding), - num_nulls: header.num_nulls as u32, - num_rows: header.num_rows as u32, - def_levels_byte_len: header.definition_levels_byte_length as u32, - rep_levels_byte_len: header.repetition_levels_byte_length as u32, - is_compressed, - statistics: statistics::from_thrift( - self.physical_type, - header.statistics, - ), - } + PageType::DataPage | PageType::DataPageV2 => { + let decoded = decode_page( + page_header, + buffer, + self.physical_type, + self.decompressor.as_mut(), + )?; + self.seen_num_values += decoded.num_values() as i64; + decoded } + 
PageType::DictionaryPage => decode_page( + page_header, + buffer, + self.physical_type, + self.decompressor.as_mut(), + )?, _ => { // For unknown page type (e.g., INDEX_PAGE), skip and read next. continue; @@ -475,9 +561,11 @@ impl PageReader for SerializedPageReader { mod tests { use super::*; use crate::basic::{self, ColumnOrder}; + use crate::file::page_index::index::Index; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; use crate::util::test_common::{get_test_file, get_test_path}; + use parquet_format::BoundaryOrder; use std::sync::Arc; #[test] @@ -486,7 +574,7 @@ mod tests { get_test_file("alltypes_plain.parquet") .read_to_end(&mut buf) .unwrap(); - let cursor = SliceableCursor::new(buf); + let cursor = Bytes::from(buf); let read_from_cursor = SerializedFileReader::new(cursor).unwrap(); let test_file = get_test_file("alltypes_plain.parquet"); @@ -605,9 +693,9 @@ mod tests { let file_metadata = metadata.file_metadata(); assert!(file_metadata.created_by().is_some()); assert_eq!( - file_metadata.created_by().unwrap(), - "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" - ); + file_metadata.created_by().unwrap(), + "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" + ); assert!(file_metadata.key_value_metadata().is_none()); assert_eq!(file_metadata.num_rows(), 8); assert_eq!(file_metadata.version(), 1); @@ -955,4 +1043,66 @@ mod tests { assert_eq!(metadata.num_row_groups(), 0); Ok(()) } + + #[test] + // Use java parquet-tools get below pageIndex info + // !``` + // parquet-tools column-index ./data_index_bloom_encoding_stats.parquet + // row group 0: + // column index for column String: + // Boudary order: ASCENDING + // page-0 : + // null count min max + // 0 Hello today + // + // offset index for column String: + // page-0 : + // offset compressed size first row index + // 4 152 0 + ///``` + // + fn test_page_index_reader() { + let test_file = 
get_test_file("data_index_bloom_encoding_stats.parquet"); + let builder = ReadOptionsBuilder::new(); + //enable read page index + let options = builder.with_page_index().build(); + let reader_result = SerializedFileReader::new_with_options(test_file, options); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + let page_indexes = metadata.page_indexes().unwrap(); + + // only one row group + assert_eq!(page_indexes.len(), 1); + let index = if let Index::BYTE_ARRAY(index) = page_indexes.get(0).unwrap() { + index + } else { + unreachable!() + }; + + assert_eq!(index.boundary_order, BoundaryOrder::Ascending); + let index_in_pages = &index.indexes; + + //only one page group + assert_eq!(index_in_pages.len(), 1); + + let page0 = index_in_pages.get(0).unwrap(); + let min = page0.min.as_ref().unwrap(); + let max = page0.max.as_ref().unwrap(); + assert_eq!("Hello", std::str::from_utf8(min.as_slice()).unwrap()); + assert_eq!("today", std::str::from_utf8(max.as_slice()).unwrap()); + + let offset_indexes = metadata.offset_indexes().unwrap(); + // only one row group + assert_eq!(offset_indexes.len(), 1); + let offset_index = offset_indexes.get(0).unwrap(); + let page_offset = offset_index.get(0).unwrap(); + + assert_eq!(4, page_offset.offset); + assert_eq!(152, page_offset.compressed_page_size); + assert_eq!(0, page_offset.first_row_index); + } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 646550dcb6be..0a8fc331e7e1 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -153,6 +153,11 @@ impl SerializedFileWriter { Ok(row_group_writer) } + /// Returns metadata for any flushed row groups + pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + &self.row_groups + } + /// Closes and finalises file writer, returning the file metadata. /// /// All row groups must be appended before this method is called. 
@@ -541,6 +546,7 @@ impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> { mod tests { use super::*; + use bytes::Bytes; use std::{fs::File, io::Cursor}; use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type}; @@ -999,7 +1005,7 @@ mod tests { ); let mut rows: i64 = 0; - for subset in &data { + for (idx, subset) in data.iter().enumerate() { let mut row_group_writer = file_writer.next_row_group().unwrap(); if let Some(mut writer) = row_group_writer.next_column().unwrap() { rows += writer @@ -1008,7 +1014,10 @@ mod tests { .unwrap() as i64; writer.close().unwrap(); } - row_group_writer.close().unwrap(); + let last_group = row_group_writer.close().unwrap(); + let flushed = file_writer.flushed_row_groups(); + assert_eq!(flushed.len(), idx + 1); + assert_eq!(flushed[idx].as_ref(), last_group.as_ref()); } file_writer.close().unwrap(); @@ -1054,7 +1063,7 @@ mod tests { } fn test_bytes_roundtrip(data: Vec>) { - let mut cursor = Cursor::new(vec![]); + let mut buffer = vec![]; let schema = Arc::new( types::Type::group_type_builder("schema") @@ -1072,7 +1081,7 @@ mod tests { { let props = Arc::new(WriterProperties::builder().build()); let mut writer = - SerializedFileWriter::new(&mut cursor, schema, props).unwrap(); + SerializedFileWriter::new(&mut buffer, schema, props).unwrap(); for subset in &data { let mut row_group_writer = writer.next_row_group().unwrap(); @@ -1089,9 +1098,7 @@ mod tests { writer.close().unwrap(); } - let buffer = cursor.into_inner(); - - let reading_cursor = crate::file::serialized_reader::SliceableCursor::new(buffer); + let reading_cursor = Bytes::from(buffer); let reader = SerializedFileReader::new(reading_cursor).unwrap(); assert_eq!(reader.num_row_groups(), data.len()); diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 95b97bc9546e..5df21e4b0d00 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -27,7 +27,7 @@ use crate::data_type::{ByteArray, Decimal, Int96}; use 
crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -#[cfg(feature = "cli")] +#[cfg(any(feature = "cli", test))] use serde_json::Value; /// Macro as a shortcut to generate 'not yet implemented' panic error. @@ -79,7 +79,7 @@ impl Row { } } - #[cfg(feature = "cli")] + #[cfg(any(feature = "cli", test))] pub fn to_json_value(&self) -> Value { Value::Object( self.fields @@ -650,7 +650,7 @@ impl Field { } } - #[cfg(feature = "cli")] + #[cfg(any(feature = "cli", test))] pub fn to_json_value(&self) -> Value { match &self { Field::Null => Value::Null, @@ -669,7 +669,7 @@ impl Field { Field::Double(n) => serde_json::Number::from_f64(*n) .map(Value::Number) .unwrap_or(Value::Null), - Field::Decimal(n) => Value::String(convert_decimal_to_string(&n)), + Field::Decimal(n) => Value::String(convert_decimal_to_string(n)), Field::Str(s) => Value::String(s.to_owned()), Field::Bytes(b) => Value::String(base64::encode(b.data())), Field::Date(d) => Value::String(convert_date_to_string(*d)), @@ -1668,7 +1668,7 @@ mod tests { } #[test] - #[cfg(feature = "cli")] + #[cfg(any(feature = "cli", test))] fn test_to_json_value() { assert_eq!(Field::Null.to_json_value(), Value::Null); assert_eq!(Field::Bool(true).to_json_value(), Value::Bool(true)); @@ -1707,21 +1707,19 @@ mod tests { ); assert_eq!( Field::Float(5.0).to_json_value(), - Value::Number(serde_json::Number::from_f64(f64::from(5.0 as f32)).unwrap()) + Value::Number(serde_json::Number::from_f64(5.0).unwrap()) ); assert_eq!( Field::Float(5.1234).to_json_value(), - Value::Number( - serde_json::Number::from_f64(f64::from(5.1234 as f32)).unwrap() - ) + Value::Number(serde_json::Number::from_f64(5.1234_f32 as f64).unwrap()) ); assert_eq!( Field::Double(6.0).to_json_value(), - Value::Number(serde_json::Number::from_f64(6.0 as f64).unwrap()) + Value::Number(serde_json::Number::from_f64(6.0).unwrap()) ); assert_eq!( Field::Double(6.1234).to_json_value(), - Value::Number(serde_json::Number::from_f64(6.1234 as 
f64).unwrap()) + Value::Number(serde_json::Number::from_f64(6.1234).unwrap()) ); assert_eq!( Field::Str("abc".to_string()).to_json_value(), diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 9cef93a69b29..8d624fe3d185 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -838,6 +838,7 @@ impl ColumnDescriptor { /// A schema descriptor. This encapsulates the top-level schemas for all the columns, /// as well as all descriptors for all the primitive columns. +#[derive(PartialEq)] pub struct SchemaDescriptor { // The top-level schema (the "message" type). // This must be a `GroupType` where each field is a root column type in the schema. diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 288c771b097b..b535ee02a0ef 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -32,6 +32,17 @@ pub fn from_ne_slice(bs: &[u8]) -> T { T::from_ne_bytes(b) } +#[inline] +pub fn from_le_slice(bs: &[u8]) -> T { + let mut b = T::Buffer::default(); + { + let b = b.as_mut(); + let bs = &bs[..b.len()]; + b.copy_from_slice(bs); + } + T::from_le_bytes(b) +} + pub trait FromBytes: Sized { type Buffer: AsMut<[u8]> + Default; fn from_le_bytes(bs: Self::Buffer) -> Self; diff --git a/parquet/src/util/cursor.rs b/parquet/src/util/cursor.rs index ff7067fcbcad..706724dbf52a 100644 --- a/parquet/src/util/cursor.rs +++ b/parquet/src/util/cursor.rs @@ -26,6 +26,7 @@ use std::{cmp, fmt}; /// because the lack of Generic Associated Type implies that you would require complex lifetime propagation when /// returning such a cursor. 
#[allow(clippy::rc_buffer)] +#[deprecated = "use bytes::Bytes instead"] pub struct SliceableCursor { inner: Arc>, start: u64, @@ -33,6 +34,7 @@ pub struct SliceableCursor { pos: u64, } +#[allow(deprecated)] impl fmt::Debug for SliceableCursor { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SliceableCursor") @@ -44,6 +46,7 @@ impl fmt::Debug for SliceableCursor { } } +#[allow(deprecated)] impl SliceableCursor { pub fn new(content: impl Into>>) -> Self { let inner = content.into(); @@ -90,6 +93,7 @@ impl SliceableCursor { } /// Implementation inspired by std::io::Cursor +#[allow(deprecated)] impl Read for SliceableCursor { fn read(&mut self, buf: &mut [u8]) -> io::Result { let n = Read::read(&mut self.remaining_slice(), buf)?; @@ -98,6 +102,7 @@ impl Read for SliceableCursor { } } +#[allow(deprecated)] impl Seek for SliceableCursor { fn seek(&mut self, pos: SeekFrom) -> io::Result { let new_pos = match pos { @@ -204,12 +209,14 @@ mod tests { use super::*; /// Create a SliceableCursor of all u8 values in ascending order + #[allow(deprecated)] fn get_u8_range() -> SliceableCursor { let data: Vec = (0u8..=255).collect(); SliceableCursor::new(data) } /// Reads all the bytes in the slice and checks that it matches the u8 range from start to end_included + #[allow(deprecated)] fn check_read_all(mut cursor: SliceableCursor, start: u8, end_included: u8) { let mut target = vec![]; let cursor_res = cursor.read_to_end(&mut target); diff --git a/parquet/src/util/memory.rs b/parquet/src/util/memory.rs index 0b0c707ff34f..909878a6d538 100644 --- a/parquet/src/util/memory.rs +++ b/parquet/src/util/memory.rs @@ -31,7 +31,6 @@ use std::{ /// when all slices are dropped. 
/// /// TODO: Remove and replace with [`bytes::Bytes`] -#[allow(clippy::rc_buffer)] #[derive(Clone, Debug)] pub struct ByteBufferPtr { data: Bytes, @@ -109,6 +108,12 @@ impl From> for ByteBufferPtr { } } +impl From for ByteBufferPtr { + fn from(data: Bytes) -> Self { + Self { data } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index bab5d230a3c2..680074d08705 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "15.0.0" +version = "16.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = "1.0" quote = "1.0" syn = { version = "1.0", features = ["full", "extra-traits"] } -parquet = { path = "../parquet", version = "15.0.0" } +parquet = { path = "../parquet", version = "16.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 61ccf3093001..4f390b0cd911 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "15.0.0" -parquet_derive = "15.0.0" +parquet = "16.0.0" +parquet_derive = "16.0.0" ``` and this to your crate root: diff --git a/parquet_derive/test/dependency/README.md b/parquet_derive/test/dependency/README.md deleted file mode 100644 index b618b4636e7c..000000000000 --- a/parquet_derive/test/dependency/README.md +++ /dev/null @@ -1,21 +0,0 @@ - - -This directory contains projects that use arrow as a dependency with -various combinations of feature flags. 
diff --git a/parquet_derive/test/dependency/default-features/Cargo.toml b/parquet_derive/test/dependency/default-features/Cargo.toml deleted file mode 100644 index 7434552aaec4..000000000000 --- a/parquet_derive/test/dependency/default-features/Cargo.toml +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -[package] -name = "defeault-features" -description = "Models a user application of parquet_derive that uses no additional features of arrow" -version = "0.1.0" -edition = "2021" -rust-version = "1.57" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -parquet_derive = { path = "../../../../parquet_derive", version = "15.0.0" } - -# Keep this out of the default workspace -[workspace] diff --git a/parquet_derive/test/dependency/default-features/src/main.rs b/parquet_derive/test/dependency/default-features/src/main.rs deleted file mode 100644 index e7a11a969c03..000000000000 --- a/parquet_derive/test/dependency/default-features/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 53daf2977620..7bf6db6730e6 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "15.0.0" +version = "16.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.57" [dependencies] -parquet = { path = "../parquet", version = "15.0.0" } -parquet_derive = { path = "../parquet_derive", version = "15.0.0" } +parquet = { path = "../parquet", version = "16.0.0" } +parquet_derive = { path = "../parquet_derive", version = "16.0.0" } chrono = "0.4.19"