From a5f80184750fb09fd232f2acbe5558b03b45a156 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 4 Apr 2024 06:41:04 +0900 Subject: [PATCH 01/10] GH-40961: [GLib] Suppress warnings for Vala examples on macOS (#40962) ### Rationale for this change There are some warnings for Vala examples on macOS: ```text FAILED: example/vala/read-file.p/meson-generated_read-file.c.o ccache cc -Iexample/vala/read-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-file.p/meson-generated_read-file.c.o -MF example/vala/read-file.p/meson-generated_read-file.c.o.d -o example/vala/read-file.p/meson-generated_read-file.c.o -c example/vala/read-file.p/read-file.c ../../c_glib/example/vala/read-file.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. ``` ```text FAILED: example/vala/read-stream.p/meson-generated_read-stream.c.o ccache cc -Iexample/vala/read-stream.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-stream.p/meson-generated_read-stream.c.o -MF example/vala/read-stream.p/meson-generated_read-stream.c.o.d -o example/vala/read-stream.p/meson-generated_read-stream.c.o -c example/vala/read-stream.p/read-stream.c ../../c_glib/example/vala/read-stream.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. 
``` ```text FAILED: example/vala/write-file.p/meson-generated_write-file.c.o ccache cc -Iexample/vala/write-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/write-file.p/meson-generated_write-file.c.o -MF example/vala/write-file.p/meson-generated_write-file.c.o.d -o example/vala/write-file.p/meson-generated_write-file.c.o -c example/vala/write-file.p/write-file.c write-file.c:373:8: error: variable '_tmp45__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp45__length1; ^ write-file.c:504:8: error: variable '_tmp57__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp57__length1; ^ write-file.c:635:8: error: variable '_tmp69__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp69__length1; ^ write-file.c:766:8: error: variable '_tmp81__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp81__length1; ^ write-file.c:897:8: error: variable '_tmp93__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp93__length1; ^ write-file.c:1028:8: error: variable '_tmp105__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp105__length1; ^ write-file.c:1159:8: error: variable '_tmp117__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp117__length1; ^ write-file.c:1290:8: error: variable '_tmp129__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp129__length1; ^ write-file.c:1421:8: error: variable '_tmp141__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp141__length1; ^ write-file.c:1552:8: error: variable '_tmp153__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp153__length1; ^ 10 errors generated. ``` ### What changes are included in this PR? * Fix wrong format string * Disable `unused-but-set-variable` warning ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40961 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/example/vala/meson.build | 6 +++++- c_glib/example/vala/read-file.vala | 4 ++-- c_glib/example/vala/read-stream.vala | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build index 474f0b1e9a51a..ff65a7328f171 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -18,11 +18,15 @@ # under the License. 
if generate_vapi + c_flags = [ + '-Wno-unused-but-set-variable', + ] + c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) vala_example_executable_kwargs = { 'c_args': [ '-I' + project_build_root, '-I' + project_source_root, - ], + ] + c_flags, 'dependencies': [ arrow_glib_vapi, dependency('gio-2.0'), diff --git a/c_glib/example/vala/read-file.vala b/c_glib/example/vala/read-file.vala index a0a06275c4b24..287eddac76352 100644 --- a/c_glib/example/vala/read-file.vala +++ b/c_glib/example/vala/read-file.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); diff --git a/c_glib/example/vala/read-stream.vala b/c_glib/example/vala/read-stream.vala index c58dc848930a8..4520c8609bdaf 100644 --- a/c_glib/example/vala/read-stream.vala +++ b/c_glib/example/vala/read-stream.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); From 5b09059ae40164c173a6ef593ab7b463b24d431d Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:31:19 -0400 Subject: [PATCH 02/10] GH-40851: [JS] Fix nullcount and make vectors created from typed arrays not nullable (#40852) * GitHub Issue: #40851 --- js/src/data.ts | 13 ++++---- js/src/vector.ts | 2 +- js/test/unit/vector/vector-tests.ts | 46 ++++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index 6f8792508858b..45fcc35d37676 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -109,7 +109,10 @@ export class Data { let nullCount = this._nullCount; let nullBitmap: Uint8Array | undefined; if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { - this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); + this._nullCount = nullCount = nullBitmap.length === 0 ? + // no null bitmap, so all values are valid + 0 : + this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); } return nullCount; } @@ -177,16 +180,16 @@ export class Data { // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s if (this.nullCount > 0) { nullBitmap.set(truncateBitmap(offset, length, this.nullBitmap), 0); + Object.assign(this, { nullBitmap }); + } else { + Object.assign(this, { nullBitmap, _nullCount: 0 }); } - Object.assign(this, { nullBitmap, _nullCount: -1 }); } const byte = nullBitmap[byteOffset]; prev = (byte & mask) !== 0; - value ? - (nullBitmap[byteOffset] = byte | mask) : - (nullBitmap[byteOffset] = byte & ~mask); + nullBitmap[byteOffset] = value ?
(byte | mask) : (byte & ~mask); } if (prev !== !!value) { diff --git a/js/src/vector.ts b/js/src/vector.ts index a7c103bc326ee..1b0d9a05796f0 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -445,7 +445,7 @@ export function makeVector(init: any) { if (init instanceof DataView) { init = new Uint8Array(init.buffer); } - const props = { offset: 0, length: init.length, nullCount: 0, data: init }; + const props = { offset: 0, length: init.length, nullCount: -1, data: init }; if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); } if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); } if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index bfcf0d8547861..a10d7c757ca17 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -33,6 +33,47 @@ describe(`makeVectorFromArray`, () => { }); }); +describe(`basic vector methods`, () => { + test(`not nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: -1, type: new Int32() })]); + expect(vector.nullable).toBe(false); + expect(vector.nullCount).toBe(0); + }); + + test(`nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: 0, type: new Int32() })]); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + expect(vector.isValid(0)).toBe(true); + + // set a value to null + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + expect(vector.isValid(0)).toBe(false); + + // set the same value to null which should not change anything + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set a different value to null + vector.set(1, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(2); + + // set first value to non-null + vector.set(0, 1); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set last null to non-null + vector.set(1, 2); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + }); +}); + describe(`StructVector`, () => { test(`makeVectorFromArray`, () => { const values: { a?: number; b?: string | null; c?: boolean | null }[] = [ @@ -108,7 +149,6 @@ describe(`DateVector`, () => { }); describe(`DictionaryVector`, () => { - const dictionary = ['foo', 'bar', 'baz']; const extras = ['abc', '123']; // values to search for that should NOT be found const dictionary_vec = vectorFromArray(dictionary, new Utf8).memoize(); @@ -117,7 +157,6 @@ describe(`DictionaryVector`, () => { const validity = Array.from({ length: indices.length }, () => Math.random() > 0.2); describe(`index with nullCount == 0`, () => { - const values = indices.map((d) => dictionary[d]); const vector = makeVector({ data: indices, @@ -133,7 +172,6 @@ describe(`DictionaryVector`, () => { }); describe(`index 
with nullCount > 0`, () => { - const nullBitmap = util.packBools(validity); const nullCount = validity.reduce((acc, d) => acc + (d ? 0 : 1), 0); const values = indices.map((d, i) => validity[i] ? dictionary[d] : null); From 2caec860894945e8bfe5b557c825ba962a6a16bd Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:32:12 -0400 Subject: [PATCH 03/10] GH-40891: [JS] Store Dates as TimestampMillisecond (#40892) Fixes #40891 Tested with ```ts const date = new Date("2023-03-29T12:34:56Z"); console.log("original", date) console.log("=> vec") const vec = arrow.vectorFromArray([date]) console.log(vec.toArray()) console.log(vec.toJSON()) console.log(vec.type) console.log(vec.get(0)) console.log("=> vec2") const vec2 = arrow.vectorFromArray([date], new arrow.DateMillisecond) console.log(vec2.toArray()) console.log(vec.toJSON()) console.log(vec2.type) console.log(vec2.get(0)) console.log("=> table") const table = arrow.tableFromJSON([{ date }]) console.log(table.toArray()) console.log(table.schema.fields[0].type) console.log(table.getChildAt(0)?.get(0)) console.log("=> table2") const table2 = arrow.tableFromIPC(arrow.tableToIPC(table)); console.log(table2.toArray()) console.log(table2.schema.fields[0].type) console.log(table2.getChildAt(0)?.get(0)) console.log("=> table3") const table3 = new arrow.Table({ dates: vec2 }) console.log(table3.toArray()) console.log(table3.schema.fields[0].type) console.log(table3.getChildAt(0)?.get(0)) ``` ``` => table [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] TimestampMillisecond { typeId: 10, unit: 1, timezone: undefined, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Timestamp", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table2 [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] Timestamp_ { typeId: 10, unit: 1, timezone: null, toString: [Function: toString], ArrayType: [class Int32Array], children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table3 [ {"dates": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] DateMillisecond { typeId: 8, unit: 1, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Date", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z ``` * GitHub Issue: #40891 --- js/src/factories.ts | 4 ++-- js/src/type.ts | 14 +++++++++++++- js/test/unit/vector/date-vector-tests.ts | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/js/src/factories.ts b/js/src/factories.ts index aa54498c50bc0..657ae1b95ab92 100644 --- a/js/src/factories.ts +++ b/js/src/factories.ts @@ -65,7 +65,7 @@ export function makeBuilder(option export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector; export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector; export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; -export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; +export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type: T): Vector; export function vectorFromArray(values: readonly (null | undefined | bigint)[], type?: T): Vector; export function 
vectorFromArray(values: readonly (null | undefined | number)[], type?: T): Vector; @@ -145,7 +145,7 @@ function inferType(value: readonly unknown[]): dtypes.DataType { } else if (booleansCount + nullsCount === value.length) { return new dtypes.Bool; } else if (datesCount + nullsCount === value.length) { - return new dtypes.DateMillisecond; + return new dtypes.TimestampMillisecond; } else if (arraysCount + nullsCount === value.length) { const array = value as Array[]; const childType = inferType(array[array.findIndex((ary) => ary != null)]); diff --git a/js/src/type.ts b/js/src/type.ts index ae3aefa025999..a42552d65ad27 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -349,7 +349,19 @@ export class Date_ extends DataType { /** @ignore */ export class DateDay extends Date_ { constructor() { super(DateUnit.DAY); } } -/** @ignore */ +/** + * A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) in milliseconds. + * According to the specification, this should be treated as the number of days, in milliseconds, since the UNIX epoch. + * Therefore, values must be evenly divisible by `86_400_000` (the number of milliseconds in a standard day). + * + * Practically, validation that values of this type are evenly divisible by `86_400_000` is not enforced by this library + * for performance and usability reasons. + * + * Users should prefer to use {@link DateDay} to cleanly represent the number of days. For JS dates, + * {@link TimestampMillisecond} is the preferred type. + * + * @ignore + */ export class DateMillisecond extends Date_ { constructor() { super(DateUnit.MILLISECOND); } } /** @ignore */ diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts index f8b4c1c7976d2..e5cd49933eac5 100644 --- a/js/test/unit/vector/date-vector-tests.ts +++ b/js/test/unit/vector/date-vector-tests.ts @@ -15,10 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-import { DateDay, DateMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; +import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; + +describe(`TimestampVector`, () => { + test(`Dates are stored in TimestampMillisecond`, () => { + const date = new Date('2023-02-01T12:34:56Z'); + const vec = vectorFromArray([date]); + expect(vec.type).toBeInstanceOf(TimestampMillisecond); + expect(vec.get(0)).toBe(date.valueOf()); + }); +}); describe(`DateVector`, () => { - it('returns days since the epoch as correct JS Dates', () => { + test(`returns days since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis32(); const date32 = table.getChildAt(0)!; @@ -28,7 +37,7 @@ describe(`DateVector`, () => { } }); - it('returns millisecond longs since the epoch as correct JS Dates', () => { + test(`returns millisecond longs since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis64(); const date64 = table.getChildAt(1)!; @@ -38,9 +47,9 @@ describe(`DateVector`, () => { } }); - it('returns the same date that was in the vector', () => { + test(`returns the same date that was in the vector`, () => { const dates = [new Date(1950, 1, 0)]; - const vec = vectorFromArray(dates); + const vec = vectorFromArray(dates, new DateMillisecond()); for (const date of vec) { expect(date).toEqual(dates.shift()); } From 89d5d8b40a5074a7ae30430b3fa95f7a9daf16da Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Apr 2024 17:14:06 -0800 Subject: [PATCH 04/10] MINOR: [R] Replace use of show_query in test-duckdb.R with dbplyr::sql_build (#40955) ### Rationale for this change I just ran the R package tests and saw a printed query mixed amongst testthat output: ``` [ FAIL 0 | WARN 0 | SKIP 1 | PASS 15 ] SELECT * FROM arrow_010 ``` I thought it would be good to silence this in some way. ### What changes are included in this PR? I silenced this by swapping out the call to `dplyr::show_query` for `dbplyr::sql_build`, which produces different output but (1) doesn't print as a side-effect, (2) is specifically made for testing, and (3) still produces output we can use in this test. For reference, this is what show_query prints (presumably via `cat`): ``` > show_query(table_four) SELECT * FROM arrow_011 ``` Whereas `sql_build`: ``` > dbplyr::sql_build(table_four) [1] `arrow_011` ``` ### Are these changes tested? Yes, but just manually on my system. ### Are there any user-facing changes? No.
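For completeness, a minimal sketch of the new pattern used in the test (a sketch only: it assumes `table_four` is the dbplyr lazy table created in `test-duckdb.R`, and that `paste` coerces the query object `sql_build` returns to a plain string, as the test relies on):

```r
# Build the query representation silently (no console output), then
# collapse it to a single string so the generated table name can be
# extracted from it.
library(dbplyr)
library(stringr)

table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n")
table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}")
```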
Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- r/tests/testthat/test-duckdb.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index 33ab1ecc7aa4d..dd7b6dba7fde0 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -281,7 +281,7 @@ test_that("to_duckdb passing a connection", { to_duckdb(con = con_separate, auto_disconnect = FALSE) # Generates a query like SELECT * FROM arrow_xxx - table_four_query <- paste(show_query(table_four), collapse = "\n") + table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n") table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}") expect_false(is.na(table_four_name)) From 36ed0328b43ca39533e58a889f8e091d1f1ca7dc Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 05:39:37 +0100 Subject: [PATCH 05/10] GH-24826: [Java] Add DUV.setOffset method (#40985) ### Are these changes tested? Yes ### Are there any user-facing changes? Yes - the addition of a public DUV.setOffset method * GitHub Issue: #24826 Authored-by: James Henderson Signed-off-by: David Li --- .../codegen/templates/DenseUnionVector.java | 8 ++++ .../arrow/vector/TestDenseUnionVector.java | 38 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index c23caf3bb5a03..8edd167152d7f 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -908,6 +908,14 @@ private int getTypeBufferValueCapacity() { return (int) typeBuffer.capacity() / TYPE_WIDTH; } + public void setOffset(int index, int offset) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + + offsetBuffer.setInt((long) index * OFFSET_WIDTH, offset); + } + private long getOffsetBufferValueCapacity() { return offsetBuffer.capacity() / OFFSET_WIDTH; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 8fd33eb5a8432..2c29861561bb7 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -99,6 +99,44 @@ public void testDenseUnionVector() throws Exception { } } + @Test + public void testSetOffset() { + try (DenseUnionVector duv = DenseUnionVector.empty("foo", allocator)) { + duv.allocateNew(); + byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType())); + byte f64TypeId = duv.registerNewTypeId(Field.notNullable("f64", MinorType.FLOAT8.getType())); + + IntVector i32Vector = ((IntVector) duv.addVector(i32TypeId, new IntVector("i32", allocator))); + Float8Vector f64Vector = ((Float8Vector) duv.addVector(f64TypeId, new Float8Vector("f64", allocator))); + + i32Vector.allocateNew(3); + f64Vector.allocateNew(1); + + duv.setTypeId(0, i32TypeId); + duv.setOffset(0, 0); + i32Vector.set(0, 42); + + duv.setTypeId(1, i32TypeId); + duv.setOffset(1, 1); + i32Vector.set(1, 43); + + duv.setTypeId(2, f64TypeId); + duv.setOffset(2, 0); + f64Vector.set(0, 3.14); + + duv.setTypeId(3, i32TypeId); + duv.setOffset(3, 2); + i32Vector.set(2, 44); + + duv.setValueCount(4); + + assertEquals(42, duv.getObject(0)); + assertEquals(43, duv.getObject(1)); + assertEquals(3.14, duv.getObject(2)); + 
assertEquals(44, duv.getObject(3)); + } + } + @Test public void testTransfer() throws Exception { try (DenseUnionVector srcVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) { From 26631d7504420ff00a827d40273b589c6d38860f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 4 Apr 2024 10:29:26 +0200 Subject: [PATCH 06/10] GH-40806: [C++] Revert changes from PR #40857 (#40980) Revert changes from https://github.com/apache/arrow/pull/40857. `GetRuntimeInfo` returns the SIMD level for dynamic dispatch, but Neon currently does not participate in dynamic dispatch (actually, Neon should be available by default on all modern Arm CPUs AFAIU). Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/config.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 1f852e84d3d5c..9e32e5437325f 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -58,8 +58,6 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { return "avx"; } else if (query_flag(CpuInfo::SSE4_2)) { return "sse4_2"; - } else if (query_flag(CpuInfo::ASIMD)) { - return "neon"; } else { return "none"; } From 640c10191a51f6d0f408c72f45dbf5d94ec0b9d7 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Apr 2024 02:51:58 -0700 Subject: [PATCH 07/10] GH-40224: [C++] Fix: improve the backpressure handling in the dataset writer (#40722) ### Rationale for this change The dataset writer would fire the resume callback as soon as the underlying dataset writer's queues freed up, even if there were pending tasks. Backpressure is not applied immediately and so a few tasks will always trickle in. If backpressure is pausing and then resuming frequently this can lead to a buildup of pending tasks and uncontrolled memory growth. ### What changes are included in this PR? The resume callback is not called until all pending write tasks have completed. ### Are these changes tested? There is quite an extensive set of tests for the dataset writer already and they continue to pass. I ran them on repeat, with and without stress, and did not see any issues. However, the underlying problem (dataset writer can have uncontrolled memory growth) is still not tested as it is quite difficult to test. I was able to run the setup described in the issue to reproduce the issue. With this fix the repartitioning task completes for me. ### Are there any user-facing changes? 
No * GitHub Issue: #40224 Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- cpp/src/arrow/dataset/dataset_writer.cc | 36 ++++++++++++++++++++----- cpp/src/arrow/util/async_util.cc | 7 +++++ cpp/src/arrow/util/async_util.h | 3 +++ cpp/src/arrow/util/async_util_test.cc | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc index 34731d19ab3eb..754386275d60c 100644 --- a/cpp/src/arrow/dataset/dataset_writer.cc +++ b/cpp/src/arrow/dataset/dataset_writer.cc @@ -515,7 +515,7 @@ class DatasetWriter::DatasetWriterImpl { std::function finish_callback, uint64_t max_rows_queued) : scheduler_(scheduler), write_tasks_(util::MakeThrottledAsyncTaskGroup( - scheduler_, 1, /*queue=*/nullptr, + scheduler_, /*max_concurrent_cost=*/1, /*queue=*/nullptr, [finish_callback = std::move(finish_callback)] { finish_callback(); return Status::OK(); @@ -541,6 +541,23 @@ class DatasetWriter::DatasetWriterImpl { } } + void ResumeIfNeeded() { + if (!paused_) { + return; + } + bool needs_resume = false; + { + std::lock_guard lg(mutex_); + if (!write_tasks_ || write_tasks_->QueueSize() == 0) { + needs_resume = true; + } + } + if (needs_resume) { + paused_ = false; + resume_callback_(); + } + } + void WriteRecordBatch(std::shared_ptr batch, const std::string& directory, const std::string& prefix) { write_tasks_->AddSimpleTask( @@ -549,11 +566,14 @@ class DatasetWriter::DatasetWriterImpl { WriteAndCheckBackpressure(std::move(batch), directory, prefix); if (!has_room.is_finished()) { // We don't have to worry about sequencing backpressure here since - // task_group_ serves as our sequencer. If batches continue to arrive after - // we pause they will queue up in task_group_ until we free up and call - // Resume + // task_group_ serves as our sequencer. If batches continue to arrive + // after we pause they will queue up in task_group_ until we free up and + // call Resume pause_callback_(); - return has_room.Then([this] { resume_callback_(); }); + paused_ = true; + return has_room.Then([this] { ResumeIfNeeded(); }); + } else { + ResumeIfNeeded(); } return has_room; }, @@ -571,6 +591,9 @@ class DatasetWriter::DatasetWriterImpl { return Future<>::MakeFinished(); }, "DatasetWriter::FinishAll"sv); + // Reset write_tasks_ to signal that we are done adding tasks, this will allow + // us to invoke the finish callback once the tasks wrap up. 
+ std::lock_guard lg(mutex_); write_tasks_.reset(); } @@ -660,7 +683,7 @@ class DatasetWriter::DatasetWriterImpl { } util::AsyncTaskScheduler* scheduler_ = nullptr; - std::unique_ptr write_tasks_; + std::unique_ptr write_tasks_; Future<> finish_fut_ = Future<>::Make(); FileSystemDatasetWriteOptions write_options_; DatasetWriterState writer_state_; @@ -670,6 +693,7 @@ class DatasetWriter::DatasetWriterImpl { std::unordered_map> directory_queues_; std::mutex mutex_; + bool paused_ = false; Status err_; }; diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 63e27bfbe5773..fbd45eadac2cd 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -118,6 +118,8 @@ class FifoQueue : public ThrottledAsyncTaskScheduler::Queue { void Purge() override { tasks_.clear(); } + std::size_t Size() const override { return tasks_.size(); } + private: std::list> tasks_; }; @@ -332,6 +334,10 @@ class ThrottledAsyncTaskSchedulerImpl void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { + std::lock_guard lk(mutex_); + return queue_->Size(); + } const util::tracing::Span& span() const override { return target_->span(); } private: @@ -499,6 +505,7 @@ class ThrottledAsyncTaskGroup : public ThrottledAsyncTaskScheduler { : throttle_(std::move(throttle)), task_group_(std::move(task_group)) {} void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { return throttle_->QueueSize(); } const util::tracing::Span& span() const override { return task_group_->span(); } bool AddTask(std::unique_ptr task) override { return task_group_->AddTask(std::move(task)); diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index 7a675da59facd..d9ed63bdbce22 100644 --- a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -226,6 +226,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { virtual bool Empty() = 0; /// Purge the queue of all items virtual void Purge() = 0; + virtual std::size_t Size() const = 0; }; class Throttle { @@ -277,6 +278,8 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { /// Allows task to be submitted again. If there is a max_concurrent_cost limit then /// it will still apply. virtual void Resume() = 0; + /// Return the number of tasks queued but not yet submitted + virtual std::size_t QueueSize() = 0; /// Create a throttled view of a scheduler /// diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc index 313ca91912335..1f9aad453e9c4 100644 --- a/cpp/src/arrow/util/async_util_test.cc +++ b/cpp/src/arrow/util/async_util_test.cc @@ -680,6 +680,7 @@ class PriorityQueue : public ThrottledAsyncTaskScheduler::Queue { queue_.pop(); } } + std::size_t Size() const { return queue_.size(); } private: std::priority_queue, From ad6758900da1706d3cbfd59e5fe7d1d548c4235b Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 12:34:40 +0100 Subject: [PATCH 08/10] MINOR: [Java] `DenseUnionVector.empty` should create not-nullable DUVs (#41001) ### Rationale for this change DUVs do not have a validity vector, so cannot be set to null - `isNull`, for example, always returns false. This change ensures vectors created through `DenseUnionVector.empty` reflect this in their FieldType. ### What changes are included in this PR? 
`DenseUnionVector.empty` now creates DUVs with a not-nullable `FieldType` ### Are these changes tested? I haven't added an explicit test for this as it would essentially be testing `FieldType.notNullable` - confirmed that it doesn't break any existing tests. ### Are there any user-facing changes? Yes, strictly speaking this is a public change, correcting a bug in a public API. **This PR includes breaking changes to public APIs.** Authored-by: James Henderson Signed-off-by: David Li --- java/vector/src/main/codegen/templates/DenseUnionVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index 8edd167152d7f..27fd8e9798b67 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -124,7 +124,7 @@ public class DenseUnionVector extends AbstractContainerVector implements FieldVe ArrowType.Struct.INSTANCE, /*dictionary*/ null, /*metadata*/ null); public static DenseUnionVector empty(String name, BufferAllocator allocator) { - FieldType fieldType = FieldType.nullable(new ArrowType.Union( + FieldType fieldType = FieldType.notNullable(new ArrowType.Union( UnionMode.Dense, null)); return new DenseUnionVector(name, allocator, fieldType, null); } From bbeeb33a2fb65f40caf6c3176ee377de2b9de6e5 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 4 Apr 2024 14:38:16 +0200 Subject: [PATCH 09/10] GH-40974: [CI][Python] CI failures on Python builds due to pytest_cython (#40975) ### Rationale for this change We are seeing sporadic CI failures on Python builds due to `pytest_cython`. ### What changes are included in this PR? Upper pin of 0.3.0 added for `pytest-cython`. ### Are these changes tested? With a green CI. ### Are there any user-facing changes? No. * GitHub Issue: #40974 Authored-by: AlenkaF Signed-off-by: Sutou Kouhei --- ci/conda_env_sphinx.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 6899f9c36a7f6..0a356d5722c42 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -29,5 +29,8 @@ sphinx-copybutton sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython -pytest-cython +# Needs upper pin of 0.3.0, see: +# https://github.com/lgpage/pytest-cython/issues/67 +# With 0.3.* bug fix release, the pin can be removed +pytest-cython==0.2.2 pandas From b99b00dd66586cf54b04ce6a51eb1cf68b1510a3 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 4 Apr 2024 21:35:00 +0800 Subject: [PATCH 10/10] GH-40994: [C++][Parquet] RleBooleanDecoder supports DecodeArrow with nulls (#40995) ### Rationale for this change Supports DecodeArrow with nulls in RleBooleanDecoder ### What changes are included in this PR? Supports DecodeArrow with nulls in RleBooleanDecoder ### Are these changes tested? Yes ### Are there any user-facing changes? 
currently not * GitHub Issue: #40994 Lead-authored-by: mwish Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 63 ++++++++++++++++++++------- cpp/src/parquet/encoding_benchmark.cc | 9 ++-- cpp/src/parquet/encoding_test.cc | 46 +++++++++---------- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f16e9b34fc682..6e93b493392c9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3143,27 +3143,58 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out) override { - if (null_count != 0) { - // TODO(ARROW-34660): implement DecodeArrow with null slots. - ParquetException::NYI("RleBoolean DecodeArrow with null slots"); + if (null_count == num_values) { + PARQUET_THROW_NOT_OK(out->AppendNulls(null_count)); + return 0; } constexpr int kBatchSize = 1024; std::array values; - int sum_decode_count = 0; - do { - int current_batch = std::min(kBatchSize, num_values); - int decoded_count = decoder_->GetBatch(values.data(), current_batch); - if (decoded_count == 0) { - break; + const int num_non_null_values = num_values - null_count; + // Remaining non-null boolean values to read from decoder. + // We decode from `decoder_` with maximum 1024 size batches. + int num_remain_non_null_values = num_non_null_values; + int current_index_in_batch = 0; + int current_batch_size = 0; + auto next_boolean_batch = [&]() { + DCHECK_GT(num_remain_non_null_values, 0); + DCHECK_EQ(current_index_in_batch, current_batch_size); + current_batch_size = std::min(num_remain_non_null_values, kBatchSize); + int decoded_count = decoder_->GetBatch(values.data(), current_batch_size); + if (ARROW_PREDICT_FALSE(decoded_count != current_batch_size)) { + // required values is more than values in decoder. + ParquetException::EofException(); } - sum_decode_count += decoded_count; - PARQUET_THROW_NOT_OK(out->Reserve(sum_decode_count)); - for (int i = 0; i < decoded_count; ++i) { - PARQUET_THROW_NOT_OK(out->Append(values[i])); + num_remain_non_null_values -= current_batch_size; + current_index_in_batch = 0; + }; + + // Reserve all values including nulls first + PARQUET_THROW_NOT_OK(out->Reserve(num_values)); + if (null_count == 0) { + // Fast-path for not having nulls. 
+ do { + next_boolean_batch(); + PARQUET_THROW_NOT_OK( + out->AppendValues(values.begin(), values.begin() + current_batch_size)); + num_values -= current_batch_size; + current_index_in_batch = 0; + } while (num_values > 0); + return num_non_null_values; + } + auto next_value = [&]() -> bool { + if (current_index_in_batch == current_batch_size) { + next_boolean_batch(); + DCHECK_GT(current_batch_size, 0); } - num_values -= decoded_count; - } while (num_values > 0); - return sum_decode_count; + DCHECK_LT(current_index_in_batch, current_batch_size); + bool value = values[current_index_in_batch]; + ++current_index_in_batch; + return value; + }; + VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { out->UnsafeAppend(next_value()); }, [&]() { out->UnsafeAppendNull(); }); + return num_non_null_values; } int DecodeArrow( diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 9c07d262b350e..a858c53e931d8 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -1518,11 +1518,10 @@ BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) (benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) ->Range(MIN_RANGE, MAX_RANGE); -// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet. -// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } -// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -// ->Apply(BooleanWithNullCustomArguments); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) + ->Apply(BooleanWithNullCustomArguments); BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow) (benchmark::State& state) { DecodeArrowDenseBenchmark(state); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ea0029f4c7d7f..bb5126ce251d4 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -602,7 +602,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Check that one can put several Arrow arrays into a given encoder // and decode to the right values (see GH-36939) -TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { +TEST(BooleanArrayEncoding, AdHocRoundTrip) { std::vector> arrays{ ::arrow::ArrayFromJSON(::arrow::boolean(), R"([])"), ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])"), @@ -610,27 +610,29 @@ TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { ::arrow::ArrayFromJSON(::arrow::boolean(), R"([true, null, false])"), }; - auto encoder = MakeTypedEncoder(Encoding::PLAIN, - /*use_dictionary=*/false); - for (const auto& array : arrays) { - encoder->Put(*array); - } - auto buffer = encoder->FlushValues(); - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays)); - decoder->SetData(static_cast(expected->length()), buffer->data(), - static_cast(buffer->size())); - - ::arrow::BooleanBuilder builder; - ASSERT_EQ(static_cast(expected->length() - expected->null_count()), - decoder->DecodeArrow(static_cast(expected->length()), - static_cast(expected->null_count()), - expected->null_bitmap_data(), 0, &builder)); + for (auto encoding : {Encoding::PLAIN, Encoding::RLE}) { + auto 
encoder = MakeTypedEncoder(encoding, + /*use_dictionary=*/false); + for (const auto& array : arrays) { + encoder->Put(*array); + } + auto buffer = encoder->FlushValues(); + auto decoder = MakeTypedDecoder(encoding); + EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays)); + decoder->SetData(static_cast(expected->length()), buffer->data(), + static_cast(buffer->size())); + + ::arrow::BooleanBuilder builder; + ASSERT_EQ(static_cast(expected->length() - expected->null_count()), + decoder->DecodeArrow(static_cast(expected->length()), + static_cast(expected->null_count()), + expected->null_bitmap_data(), 0, &builder)); - std::shared_ptr<::arrow::Array> result; - ASSERT_OK(builder.Finish(&result)); - ASSERT_EQ(expected->length(), result->length()); - ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true); + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(builder.Finish(&result)); + ASSERT_EQ(expected->length(), result->length()); + ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true); + } } template @@ -963,8 +965,6 @@ TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) { } TYPED_TEST(EncodingAdHocTyped, RleArrowDirectPut) { - // TODO: test with nulls once RleBooleanDecoder::DecodeArrow supports them - this->null_probability_ = 0; for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { this->Rle(seed); }
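To make the new behavior concrete, here is a minimal round-trip sketch distilled from the test above. It is a sketch only: it assumes the public `parquet/encoding.h` factory helpers and Arrow's `ArrayFromJSON` testing utility, and spells out the `<parquet::BooleanType>` template arguments explicitly.

```cpp
#include "arrow/builder.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/encoding.h"

// RLE-encode a boolean array that contains nulls, then decode it back
// through DecodeArrow, which this patch makes work for null slots
// instead of raising ParquetException::NYI.
::arrow::Status RleBooleanNullRoundTrip() {
  auto array = ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])");

  auto encoder = parquet::MakeTypedEncoder<parquet::BooleanType>(
      parquet::Encoding::RLE, /*use_dictionary=*/false);
  encoder->Put(*array);
  auto buffer = encoder->FlushValues();

  auto decoder =
      parquet::MakeTypedDecoder<parquet::BooleanType>(parquet::Encoding::RLE);
  decoder->SetData(static_cast<int>(array->length()), buffer->data(),
                   static_cast<int>(buffer->size()));

  // Null slots are filled in from the validity bitmap; DecodeArrow's
  // return value counts only the decoded non-null values (2 here).
  ::arrow::BooleanBuilder builder;
  decoder->DecodeArrow(static_cast<int>(array->length()),
                       static_cast<int>(array->null_count()),
                       array->null_bitmap_data(), /*valid_bits_offset=*/0,
                       &builder);

  std::shared_ptr<::arrow::Array> result;
  ARROW_RETURN_NOT_OK(builder.Finish(&result));
  // result now equals the input array, nulls included.
  return ::arrow::Status::OK();
}
```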