From a5f80184750fb09fd232f2acbe5558b03b45a156 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 4 Apr 2024 06:41:04 +0900 Subject: [PATCH 01/10] GH-40961: [GLib] Suppress warnings for Vala examples on macOS (#40962) ### Rationale for this change There are some warnings for Vala examples on macOS: ```text FAILED: example/vala/read-file.p/meson-generated_read-file.c.o ccache cc -Iexample/vala/read-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-file.p/meson-generated_read-file.c.o -MF example/vala/read-file.p/meson-generated_read-file.c.o.d -o example/vala/read-file.p/meson-generated_read-file.c.o -c example/vala/read-file.p/read-file.c ../../c_glib/example/vala/read-file.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. ``` ```text FAILED: example/vala/read-stream.p/meson-generated_read-stream.c.o ccache cc -Iexample/vala/read-stream.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-stream.p/meson-generated_read-stream.c.o -MF example/vala/read-stream.p/meson-generated_read-stream.c.o.d -o example/vala/read-stream.p/meson-generated_read-stream.c.o -c example/vala/read-stream.p/read-stream.c ../../c_glib/example/vala/read-stream.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. 
``` ```text FAILED: example/vala/write-file.p/meson-generated_write-file.c.o ccache cc -Iexample/vala/write-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/write-file.p/meson-generated_write-file.c.o -MF example/vala/write-file.p/meson-generated_write-file.c.o.d -o example/vala/write-file.p/meson-generated_write-file.c.o -c example/vala/write-file.p/write-file.c write-file.c:373:8: error: variable '_tmp45__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp45__length1; ^ write-file.c:504:8: error: variable '_tmp57__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp57__length1; ^ write-file.c:635:8: error: variable '_tmp69__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp69__length1; ^ write-file.c:766:8: error: variable '_tmp81__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp81__length1; ^ write-file.c:897:8: error: variable '_tmp93__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp93__length1; ^ write-file.c:1028:8: error: variable '_tmp105__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp105__length1; ^ write-file.c:1159:8: error: variable '_tmp117__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp117__length1; ^ write-file.c:1290:8: error: variable '_tmp129__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp129__length1; ^ write-file.c:1421:8: error: variable '_tmp141__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp141__length1; ^ write-file.c:1552:8: error: variable '_tmp153__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp153__length1; ^ 10 errors generated. ``` ### What changes are included in this PR? * Fix wrong format string * Disable `unused-but-set-variable` warning ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40961 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/example/vala/meson.build | 6 +++++- c_glib/example/vala/read-file.vala | 4 ++-- c_glib/example/vala/read-stream.vala | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build index 474f0b1e9a51a..ff65a7328f171 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -18,11 +18,15 @@ # under the License. 
if generate_vapi + c_flags = [ + '-Wno-unused-but-set-variable', + ] + c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) vala_example_executable_kwargs = { 'c_args': [ '-I' + project_build_root, '-I' + project_source_root, - ], + ] + c_flags, 'dependencies': [ arrow_glib_vapi, dependency('gio-2.0'), diff --git a/c_glib/example/vala/read-file.vala b/c_glib/example/vala/read-file.vala index a0a06275c4b24..287eddac76352 100644 --- a/c_glib/example/vala/read-file.vala +++ b/c_glib/example/vala/read-file.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); diff --git a/c_glib/example/vala/read-stream.vala b/c_glib/example/vala/read-stream.vala index c58dc848930a8..4520c8609bdaf 100644 --- a/c_glib/example/vala/read-stream.vala +++ b/c_glib/example/vala/read-stream.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); From 5b09059ae40164c173a6ef593ab7b463b24d431d Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:31:19 -0400 Subject: [PATCH 02/10] GH-40851: [JS] Fix nullcount and make vectors created from typed arrays not nullable (#40852) * GitHub Issue: #40851 --- js/src/data.ts | 13 ++++---- js/src/vector.ts | 2 +- js/test/unit/vector/vector-tests.ts | 46 ++++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index 6f8792508858b..45fcc35d37676 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -109,7 +109,10 @@ export class Data { let nullCount = this._nullCount; let nullBitmap: Uint8Array | undefined; if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { - this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); + this._nullCount = nullCount = nullBitmap.length === 0 ? + // no null bitmap, so all values are valid + 0 : + this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); } return nullCount; } @@ -177,16 +180,16 @@ export class Data { // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s if (this.nullCount > 0) { nullBitmap.set(truncateBitmap(offset, length, this.nullBitmap), 0); + Object.assign(this, { nullBitmap }); + } else { + Object.assign(this, { nullBitmap, _nullCount: 0 }); } - Object.assign(this, { nullBitmap, _nullCount: -1 }); } const byte = nullBitmap[byteOffset]; prev = (byte & mask) !== 0; - value ? - (nullBitmap[byteOffset] = byte | mask) : - (nullBitmap[byteOffset] = byte & ~mask); + nullBitmap[byteOffset] = value ?
(byte | mask) : (byte & ~mask); } if (prev !== !!value) { diff --git a/js/src/vector.ts b/js/src/vector.ts index a7c103bc326ee..1b0d9a05796f0 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -445,7 +445,7 @@ export function makeVector(init: any) { if (init instanceof DataView) { init = new Uint8Array(init.buffer); } - const props = { offset: 0, length: init.length, nullCount: 0, data: init }; + const props = { offset: 0, length: init.length, nullCount: -1, data: init }; if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); } if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); } if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index bfcf0d8547861..a10d7c757ca17 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -33,6 +33,47 @@ describe(`makeVectorFromArray`, () => { }); }); +describe(`basic vector methods`, () => { + test(`not nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: -1, type: new Int32() })]); + expect(vector.nullable).toBe(false); + expect(vector.nullCount).toBe(0); + }); + + test(`nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: 0, type: new Int32() })]); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + expect(vector.isValid(0)).toBe(true); + + // set a value to null + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + expect(vector.isValid(0)).toBe(false); + + // set the same value to null which should not change anything + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set a different value to null + vector.set(1, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(2); + + // set first value to non-null + vector.set(0, 1); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set last null to non-null + vector.set(1, 2); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + }); +}); + describe(`StructVector`, () => { test(`makeVectorFromArray`, () => { const values: { a?: number; b?: string | null; c?: boolean | null }[] = [ @@ -108,7 +149,6 @@ describe(`DateVector`, () => { }); describe(`DictionaryVector`, () => { - const dictionary = ['foo', 'bar', 'baz']; const extras = ['abc', '123']; // values to search for that should NOT be found const dictionary_vec = vectorFromArray(dictionary, new Utf8).memoize(); @@ -117,7 +157,6 @@ describe(`DictionaryVector`, () => { const validity = Array.from({ length: indices.length }, () => Math.random() > 0.2); describe(`index with nullCount == 0`, () => { - const values = indices.map((d) => dictionary[d]); const vector = makeVector({ data: indices, @@ -133,7 +172,6 @@ describe(`DictionaryVector`, () => { }); describe(`index 
with nullCount > 0`, () => { - const nullBitmap = util.packBools(validity); const nullCount = validity.reduce((acc, d) => acc + (d ? 0 : 1), 0); const values = indices.map((d, i) => validity[i] ? dictionary[d] : null); From 2caec860894945e8bfe5b557c825ba962a6a16bd Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:32:12 -0400 Subject: [PATCH 03/10] GH-40891: [JS] Store Dates as TimestampMillisecond (#40892) Fixes #40891 Tested with ```ts const date = new Date("2023-03-29T12:34:56Z"); console.log("original", date) console.log("=> vec") const vec = arrow.vectorFromArray([date]) console.log(vec.toArray()) console.log(vec.toJSON()) console.log(vec.type) console.log(vec.get(0)) console.log("=> vec2") const vec2 = arrow.vectorFromArray([date], new arrow.DateMillisecond) console.log(vec2.toArray()) console.log(vec.toJSON()) console.log(vec2.type) console.log(vec2.get(0)) console.log("=> table") const table = arrow.tableFromJSON([{ date }]) console.log(table.toArray()) console.log(table.schema.fields[0].type) console.log(table.getChildAt(0)?.get(0)) console.log("=> table2") const table2 = arrow.tableFromIPC(arrow.tableToIPC(table)); console.log(table2.toArray()) console.log(table2.schema.fields[0].type) console.log(table2.getChildAt(0)?.get(0)) console.log("=> table3") const table3 = new arrow.Table({ dates: vec2 }) console.log(table3.toArray()) console.log(table3.schema.fields[0].type) console.log(table3.getChildAt(0)?.get(0)) ``` ``` => table [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] TimestampMillisecond { typeId: 10, unit: 1, timezone: undefined, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Timestamp", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table2 [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] Timestamp_ { typeId: 10, unit: 1, timezone: null, toString: [Function: toString], ArrayType: [class Int32Array], children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table3 [ {"dates": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] DateMillisecond { typeId: 8, unit: 1, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Date", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z ``` * GitHub Issue: #40891 --- js/src/factories.ts | 4 ++-- js/src/type.ts | 14 +++++++++++++- js/test/unit/vector/date-vector-tests.ts | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/js/src/factories.ts b/js/src/factories.ts index aa54498c50bc0..657ae1b95ab92 100644 --- a/js/src/factories.ts +++ b/js/src/factories.ts @@ -65,7 +65,7 @@ export function makeBuilder(option export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector; export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector; export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; -export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; +export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type: T): Vector; export function vectorFromArray(values: readonly (null | undefined | bigint)[], type?: T): Vector; export function 
vectorFromArray(values: readonly (null | undefined | number)[], type?: T): Vector; @@ -145,7 +145,7 @@ function inferType(value: readonly unknown[]): dtypes.DataType { } else if (booleansCount + nullsCount === value.length) { return new dtypes.Bool; } else if (datesCount + nullsCount === value.length) { - return new dtypes.DateMillisecond; + return new dtypes.TimestampMillisecond; } else if (arraysCount + nullsCount === value.length) { const array = value as Array[]; const childType = inferType(array[array.findIndex((ary) => ary != null)]); diff --git a/js/src/type.ts b/js/src/type.ts index ae3aefa025999..a42552d65ad27 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -349,7 +349,19 @@ export class Date_ extends DataType { /** @ignore */ export class DateDay extends Date_ { constructor() { super(DateUnit.DAY); } } -/** @ignore */ +/** + * A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) in milliseconds. + * According to the specification, this should be treated as the number of days, in milliseconds, since the UNIX epoch. + * Therefore, values must be evenly divisible by `86_400_000` (the number of milliseconds in a standard day). + * + * Practically, validation that values of this type are evenly divisible by `86_400_000` is not enforced by this library + * for performance and usability reasons. + * + * Users should prefer to use {@link DateDay} to cleanly represent the number of days. For JS dates, + * {@link TimestampMillisecond} is the preferred type. + * + * @ignore + */ export class DateMillisecond extends Date_ { constructor() { super(DateUnit.MILLISECOND); } } /** @ignore */ diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts index f8b4c1c7976d2..e5cd49933eac5 100644 --- a/js/test/unit/vector/date-vector-tests.ts +++ b/js/test/unit/vector/date-vector-tests.ts @@ -15,10 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-import { DateDay, DateMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; +import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; + +describe(`TimestampVector`, () => { + test(`Dates are stored in TimestampMillisecond`, () => { + const date = new Date('2023-02-01T12:34:56Z'); + const vec = vectorFromArray([date]); + expect(vec.type).toBeInstanceOf(TimestampMillisecond); + expect(vec.get(0)).toBe(date.valueOf()); + }); +}); describe(`DateVector`, () => { - it('returns days since the epoch as correct JS Dates', () => { + test(`returns days since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis32(); const date32 = table.getChildAt(0)!; @@ -28,7 +37,7 @@ describe(`DateVector`, () => { } }); - it('returns millisecond longs since the epoch as correct JS Dates', () => { + test(`returns millisecond longs since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis64(); const date64 = table.getChildAt(1)!; @@ -38,9 +47,9 @@ describe(`DateVector`, () => { } }); - it('returns the same date that was in the vector', () => { + test(`returns the same date that was in the vector`, () => { const dates = [new Date(1950, 1, 0)]; - const vec = vectorFromArray(dates); + const vec = vectorFromArray(dates, new DateMillisecond()); for (const date of vec) { expect(date).toEqual(dates.shift()); } From 89d5d8b40a5074a7ae30430b3fa95f7a9daf16da Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Apr 2024 17:14:06 -0800 Subject: [PATCH 04/10] MINOR: [R] Replace use of show_query in test-duckdb.R with dbplyr::sql_build (#40955) ### Rationale for this change I just ran the R package tests and saw a printed query mixed amongst testthat output: ``` [ FAIL 0 | WARN 0 | SKIP 1 | PASS 15 ] SELECT * FROM arrow_010 ``` I thought it would be good to silence this in some way. ### What changes are included in this PR? I silenced this by swapping out the call to `dplyr::show_query` for `dbplyr::sql_build`, which produces different output but (1) doesn't print as a side-effect, (2) is specifically made for testing, and (3) still produces output we can use in this test. For reference, this is what show_query prints (presumably via `cat`): ``` > show_query(table_four) SELECT * FROM arrow_011 ``` Whereas `sql_build`: ``` > dbplyr::sql_build(table_four) [1] `arrow_011` ``` ### Are these changes tested? Yes, but just manually on my system. ### Are there any user-facing changes? No.
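For completeness, a minimal sketch of the new pattern used in the test (a sketch only: it assumes `table_four` is the dbplyr lazy table created in `test-duckdb.R`, and that `paste` coerces the query object `sql_build` returns to a plain string, as the test relies on):

```r
# Build the query representation silently (no console output), then
# collapse it to a single string so the generated table name can be
# extracted from it.
library(dbplyr)
library(stringr)

table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n")
table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}")
```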
Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- r/tests/testthat/test-duckdb.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index 33ab1ecc7aa4d..dd7b6dba7fde0 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -281,7 +281,7 @@ test_that("to_duckdb passing a connection", { to_duckdb(con = con_separate, auto_disconnect = FALSE) # Generates a query like SELECT * FROM arrow_xxx - table_four_query <- paste(show_query(table_four), collapse = "\n") + table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n") table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}") expect_false(is.na(table_four_name)) From 36ed0328b43ca39533e58a889f8e091d1f1ca7dc Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 05:39:37 +0100 Subject: [PATCH 05/10] GH-24826: [Java] Add DUV.setOffset method (#40985) ### Are these changes tested? Yes ### Are there any user-facing changes? Yes - the addition of a public DUV.setOffset method * GitHub Issue: #24826 Authored-by: James Henderson Signed-off-by: David Li --- .../codegen/templates/DenseUnionVector.java | 8 ++++ .../arrow/vector/TestDenseUnionVector.java | 38 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index c23caf3bb5a03..8edd167152d7f 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -908,6 +908,14 @@ private int getTypeBufferValueCapacity() { return (int) typeBuffer.capacity() / TYPE_WIDTH; } + public void setOffset(int index, int offset) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + + offsetBuffer.setInt((long) index * OFFSET_WIDTH, offset); + } + private long getOffsetBufferValueCapacity() { return offsetBuffer.capacity() / OFFSET_WIDTH; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 8fd33eb5a8432..2c29861561bb7 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -99,6 +99,44 @@ public void testDenseUnionVector() throws Exception { } } + @Test + public void testSetOffset() { + try (DenseUnionVector duv = DenseUnionVector.empty("foo", allocator)) { + duv.allocateNew(); + byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType())); + byte f64TypeId = duv.registerNewTypeId(Field.notNullable("f64", MinorType.FLOAT8.getType())); + + IntVector i32Vector = ((IntVector) duv.addVector(i32TypeId, new IntVector("i32", allocator))); + Float8Vector f64Vector = ((Float8Vector) duv.addVector(f64TypeId, new Float8Vector("f64", allocator))); + + i32Vector.allocateNew(3); + f64Vector.allocateNew(1); + + duv.setTypeId(0, i32TypeId); + duv.setOffset(0, 0); + i32Vector.set(0, 42); + + duv.setTypeId(1, i32TypeId); + duv.setOffset(1, 1); + i32Vector.set(1, 43); + + duv.setTypeId(2, f64TypeId); + duv.setOffset(2, 0); + f64Vector.set(0, 3.14); + + duv.setTypeId(3, i32TypeId); + duv.setOffset(3, 2); + i32Vector.set(2, 44); + + duv.setValueCount(4); + + assertEquals(42, duv.getObject(0)); + assertEquals(43, duv.getObject(1)); + assertEquals(3.14, duv.getObject(2)); + 
assertEquals(44, duv.getObject(3)); + } + } + @Test public void testTransfer() throws Exception { try (DenseUnionVector srcVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) { From 26631d7504420ff00a827d40273b589c6d38860f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 4 Apr 2024 10:29:26 +0200 Subject: [PATCH 06/10] GH-40806: [C++] Revert changes from PR #40857 (#40980) Revert changes from https://github.com/apache/arrow/pull/40857. `GetRuntimeInfo` returns the SIMD level for dynamic dispatch, but Neon currently does not participate in dynamic dispatch (actually, Neon should be available by default on all modern Arm CPUs AFAIU). Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/config.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 1f852e84d3d5c..9e32e5437325f 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -58,8 +58,6 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { return "avx"; } else if (query_flag(CpuInfo::SSE4_2)) { return "sse4_2"; - } else if (query_flag(CpuInfo::ASIMD)) { - return "neon"; } else { return "none"; } From 640c10191a51f6d0f408c72f45dbf5d94ec0b9d7 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Apr 2024 02:51:58 -0700 Subject: [PATCH 07/10] GH-40224: [C++] Fix: improve the backpressure handling in the dataset writer (#40722) ### Rationale for this change The dataset writer would fire the resume callback as soon as the underlying dataset writer's queues freed up, even if there were pending tasks. Backpressure is not applied immediately and so a few tasks will always trickle in. If backpressure is pausing and then resuming frequently this can lead to a buildup of pending tasks and uncontrolled memory growth. ### What changes are included in this PR? The resume callback is not called until all pending write tasks have completed. ### Are these changes tested? There is quite an extensive set of tests for the dataset writer already and they continue to pass. I ran them on repeat, with and without stress, and did not see any issues. However, the underlying problem (dataset writer can have uncontrolled memory growth) is still not tested as it is quite difficult to test. I was able to run the setup described in the issue to reproduce the issue. With this fix the repartitioning task completes for me. ### Are there any user-facing changes? 
No * GitHub Issue: #40224 Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- cpp/src/arrow/dataset/dataset_writer.cc | 36 ++++++++++++++++++++----- cpp/src/arrow/util/async_util.cc | 7 +++++ cpp/src/arrow/util/async_util.h | 3 +++ cpp/src/arrow/util/async_util_test.cc | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc index 34731d19ab3eb..754386275d60c 100644 --- a/cpp/src/arrow/dataset/dataset_writer.cc +++ b/cpp/src/arrow/dataset/dataset_writer.cc @@ -515,7 +515,7 @@ class DatasetWriter::DatasetWriterImpl { std::function finish_callback, uint64_t max_rows_queued) : scheduler_(scheduler), write_tasks_(util::MakeThrottledAsyncTaskGroup( - scheduler_, 1, /*queue=*/nullptr, + scheduler_, /*max_concurrent_cost=*/1, /*queue=*/nullptr, [finish_callback = std::move(finish_callback)] { finish_callback(); return Status::OK(); @@ -541,6 +541,23 @@ class DatasetWriter::DatasetWriterImpl { } } + void ResumeIfNeeded() { + if (!paused_) { + return; + } + bool needs_resume = false; + { + std::lock_guard lg(mutex_); + if (!write_tasks_ || write_tasks_->QueueSize() == 0) { + needs_resume = true; + } + } + if (needs_resume) { + paused_ = false; + resume_callback_(); + } + } + void WriteRecordBatch(std::shared_ptr batch, const std::string& directory, const std::string& prefix) { write_tasks_->AddSimpleTask( @@ -549,11 +566,14 @@ class DatasetWriter::DatasetWriterImpl { WriteAndCheckBackpressure(std::move(batch), directory, prefix); if (!has_room.is_finished()) { // We don't have to worry about sequencing backpressure here since - // task_group_ serves as our sequencer. If batches continue to arrive after - // we pause they will queue up in task_group_ until we free up and call - // Resume + // task_group_ serves as our sequencer. If batches continue to arrive + // after we pause they will queue up in task_group_ until we free up and + // call Resume pause_callback_(); - return has_room.Then([this] { resume_callback_(); }); + paused_ = true; + return has_room.Then([this] { ResumeIfNeeded(); }); + } else { + ResumeIfNeeded(); } return has_room; }, @@ -571,6 +591,9 @@ class DatasetWriter::DatasetWriterImpl { return Future<>::MakeFinished(); }, "DatasetWriter::FinishAll"sv); + // Reset write_tasks_ to signal that we are done adding tasks, this will allow + // us to invoke the finish callback once the tasks wrap up. 
+ std::lock_guard lg(mutex_); write_tasks_.reset(); } @@ -660,7 +683,7 @@ class DatasetWriter::DatasetWriterImpl { } util::AsyncTaskScheduler* scheduler_ = nullptr; - std::unique_ptr write_tasks_; + std::unique_ptr write_tasks_; Future<> finish_fut_ = Future<>::Make(); FileSystemDatasetWriteOptions write_options_; DatasetWriterState writer_state_; @@ -670,6 +693,7 @@ class DatasetWriter::DatasetWriterImpl { std::unordered_map> directory_queues_; std::mutex mutex_; + bool paused_ = false; Status err_; }; diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 63e27bfbe5773..fbd45eadac2cd 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -118,6 +118,8 @@ class FifoQueue : public ThrottledAsyncTaskScheduler::Queue { void Purge() override { tasks_.clear(); } + std::size_t Size() const override { return tasks_.size(); } + private: std::list> tasks_; }; @@ -332,6 +334,10 @@ class ThrottledAsyncTaskSchedulerImpl void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { + std::lock_guard lk(mutex_); + return queue_->Size(); + } const util::tracing::Span& span() const override { return target_->span(); } private: @@ -499,6 +505,7 @@ class ThrottledAsyncTaskGroup : public ThrottledAsyncTaskScheduler { : throttle_(std::move(throttle)), task_group_(std::move(task_group)) {} void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { return throttle_->QueueSize(); } const util::tracing::Span& span() const override { return task_group_->span(); } bool AddTask(std::unique_ptr task) override { return task_group_->AddTask(std::move(task)); diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index 7a675da59facd..d9ed63bdbce22 100644 --- a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -226,6 +226,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { virtual bool Empty() = 0; /// Purge the queue of all items virtual void Purge() = 0; + virtual std::size_t Size() const = 0; }; class Throttle { @@ -277,6 +278,8 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { /// Allows task to be submitted again. If there is a max_concurrent_cost limit then /// it will still apply. virtual void Resume() = 0; + /// Return the number of tasks queued but not yet submitted + virtual std::size_t QueueSize() = 0; /// Create a throttled view of a scheduler /// diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc index 313ca91912335..1f9aad453e9c4 100644 --- a/cpp/src/arrow/util/async_util_test.cc +++ b/cpp/src/arrow/util/async_util_test.cc @@ -680,6 +680,7 @@ class PriorityQueue : public ThrottledAsyncTaskScheduler::Queue { queue_.pop(); } } + std::size_t Size() const { return queue_.size(); } private: std::priority_queue, From ad6758900da1706d3cbfd59e5fe7d1d548c4235b Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 12:34:40 +0100 Subject: [PATCH 08/10] MINOR: [Java] `DenseUnionVector.empty` should create not-nullable DUVs (#41001) ### Rationale for this change DUVs do not have a validity vector, so cannot be set to null - `isNull`, for example, always returns false. This change ensures vectors created through `DenseUnionVector.empty` reflect this in their FieldType. ### What changes are included in this PR? 
`DenseUnionVector.empty` now creates DUVs with a not-nullable `FieldType` ### Are these changes tested? I haven't added an explicit test for this as it would essentially be testing `FieldType.notNullable` - confirmed that it doesn't break any existing tests. ### Are there any user-facing changes? Yes, strictly speaking this is a public change, correcting a bug in a public API. **This PR includes breaking changes to public APIs.** Authored-by: James Henderson Signed-off-by: David Li --- java/vector/src/main/codegen/templates/DenseUnionVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index 8edd167152d7f..27fd8e9798b67 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -124,7 +124,7 @@ public class DenseUnionVector extends AbstractContainerVector implements FieldVe ArrowType.Struct.INSTANCE, /*dictionary*/ null, /*metadata*/ null); public static DenseUnionVector empty(String name, BufferAllocator allocator) { - FieldType fieldType = FieldType.nullable(new ArrowType.Union( + FieldType fieldType = FieldType.notNullable(new ArrowType.Union( UnionMode.Dense, null)); return new DenseUnionVector(name, allocator, fieldType, null); } From bbeeb33a2fb65f40caf6c3176ee377de2b9de6e5 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 4 Apr 2024 14:38:16 +0200 Subject: [PATCH 09/10] GH-40974: [CI][Python] CI failures on Python builds due to pytest_cython (#40975) ### Rationale for this change We are seeing sporadic CI failures on Python builds due to `pytest_cython`. ### What changes are included in this PR? Upper pin of 0.3.0 added for `pytest-cython`. ### Are these changes tested? With a green CI. ### Are there any user-facing changes? No. * GitHub Issue: #40974 Authored-by: AlenkaF Signed-off-by: Sutou Kouhei --- ci/conda_env_sphinx.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 6899f9c36a7f6..0a356d5722c42 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -29,5 +29,8 @@ sphinx-copybutton sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython -pytest-cython +# Needs upper pin of 0.3.0, see: +# https://github.com/lgpage/pytest-cython/issues/67 +# With 0.3.* bug fix release, the pin can be removed +pytest-cython==0.2.2 pandas From b99b00dd66586cf54b04ce6a51eb1cf68b1510a3 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 4 Apr 2024 21:35:00 +0800 Subject: [PATCH 10/10] GH-40994: [C++][Parquet] RleBooleanDecoder supports DecodeArrow with nulls (#40995) ### Rationale for this change Supports DecodeArrow with nulls in RleBooleanDecoder ### What changes are included in this PR? Supports DecodeArrow with nulls in RleBooleanDecoder ### Are these changes tested? Yes ### Are there any user-facing changes? 
currently not * GitHub Issue: #40994 Lead-authored-by: mwish Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 63 ++++++++++++++++++++------- cpp/src/parquet/encoding_benchmark.cc | 9 ++-- cpp/src/parquet/encoding_test.cc | 46 +++++++++---------- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f16e9b34fc682..6e93b493392c9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3143,27 +3143,58 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out) override { - if (null_count != 0) { - // TODO(ARROW-34660): implement DecodeArrow with null slots. - ParquetException::NYI("RleBoolean DecodeArrow with null slots"); + if (null_count == num_values) { + PARQUET_THROW_NOT_OK(out->AppendNulls(null_count)); + return 0; } constexpr int kBatchSize = 1024; std::array values; - int sum_decode_count = 0; - do { - int current_batch = std::min(kBatchSize, num_values); - int decoded_count = decoder_->GetBatch(values.data(), current_batch); - if (decoded_count == 0) { - break; + const int num_non_null_values = num_values - null_count; + // Remaining non-null boolean values to read from decoder. + // We decode from `decoder_` with maximum 1024 size batches. + int num_remain_non_null_values = num_non_null_values; + int current_index_in_batch = 0; + int current_batch_size = 0; + auto next_boolean_batch = [&]() { + DCHECK_GT(num_remain_non_null_values, 0); + DCHECK_EQ(current_index_in_batch, current_batch_size); + current_batch_size = std::min(num_remain_non_null_values, kBatchSize); + int decoded_count = decoder_->GetBatch(values.data(), current_batch_size); + if (ARROW_PREDICT_FALSE(decoded_count != current_batch_size)) { + // required values is more than values in decoder. + ParquetException::EofException(); } - sum_decode_count += decoded_count; - PARQUET_THROW_NOT_OK(out->Reserve(sum_decode_count)); - for (int i = 0; i < decoded_count; ++i) { - PARQUET_THROW_NOT_OK(out->Append(values[i])); + num_remain_non_null_values -= current_batch_size; + current_index_in_batch = 0; + }; + + // Reserve all values including nulls first + PARQUET_THROW_NOT_OK(out->Reserve(num_values)); + if (null_count == 0) { + // Fast-path for not having nulls. 
+ do { + next_boolean_batch(); + PARQUET_THROW_NOT_OK( + out->AppendValues(values.begin(), values.begin() + current_batch_size)); + num_values -= current_batch_size; + current_index_in_batch = 0; + } while (num_values > 0); + return num_non_null_values; + } + auto next_value = [&]() -> bool { + if (current_index_in_batch == current_batch_size) { + next_boolean_batch(); + DCHECK_GT(current_batch_size, 0); } - num_values -= decoded_count; - } while (num_values > 0); - return sum_decode_count; + DCHECK_LT(current_index_in_batch, current_batch_size); + bool value = values[current_index_in_batch]; + ++current_index_in_batch; + return value; + }; + VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { out->UnsafeAppend(next_value()); }, [&]() { out->UnsafeAppendNull(); }); + return num_non_null_values; } int DecodeArrow( diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 9c07d262b350e..a858c53e931d8 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -1518,11 +1518,10 @@ BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) (benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) ->Range(MIN_RANGE, MAX_RANGE); -// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet. -// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } -// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -// ->Apply(BooleanWithNullCustomArguments); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) + ->Apply(BooleanWithNullCustomArguments); BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow) (benchmark::State& state) { DecodeArrowDenseBenchmark(state); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ea0029f4c7d7f..bb5126ce251d4 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -602,7 +602,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Check that one can put several Arrow arrays into a given encoder // and decode to the right values (see GH-36939) -TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { +TEST(BooleanArrayEncoding, AdHocRoundTrip) { std::vector> arrays{ ::arrow::ArrayFromJSON(::arrow::boolean(), R"([])"), ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])"), @@ -610,27 +610,29 @@ TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { ::arrow::ArrayFromJSON(::arrow::boolean(), R"([true, null, false])"), }; - auto encoder = MakeTypedEncoder(Encoding::PLAIN, - /*use_dictionary=*/false); - for (const auto& array : arrays) { - encoder->Put(*array); - } - auto buffer = encoder->FlushValues(); - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays)); - decoder->SetData(static_cast(expected->length()), buffer->data(), - static_cast(buffer->size())); - - ::arrow::BooleanBuilder builder; - ASSERT_EQ(static_cast(expected->length() - expected->null_count()), - decoder->DecodeArrow(static_cast(expected->length()), - static_cast(expected->null_count()), - expected->null_bitmap_data(), 0, &builder)); + for (auto encoding : {Encoding::PLAIN, Encoding::RLE}) { + auto 
encoder = MakeTypedEncoder(encoding, + /*use_dictionary=*/false); + for (const auto& array : arrays) { + encoder->Put(*array); + } + auto buffer = encoder->FlushValues(); + auto decoder = MakeTypedDecoder(encoding); + EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays)); + decoder->SetData(static_cast(expected->length()), buffer->data(), + static_cast(buffer->size())); + + ::arrow::BooleanBuilder builder; + ASSERT_EQ(static_cast(expected->length() - expected->null_count()), + decoder->DecodeArrow(static_cast(expected->length()), + static_cast(expected->null_count()), + expected->null_bitmap_data(), 0, &builder)); - std::shared_ptr<::arrow::Array> result; - ASSERT_OK(builder.Finish(&result)); - ASSERT_EQ(expected->length(), result->length()); - ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true); + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(builder.Finish(&result)); + ASSERT_EQ(expected->length(), result->length()); + ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true); + } } template @@ -963,8 +965,6 @@ TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) { } TYPED_TEST(EncodingAdHocTyped, RleArrowDirectPut) { - // TODO: test with nulls once RleBooleanDecoder::DecodeArrow supports them - this->null_probability_ = 0; for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { this->Rle(seed); }
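To make the new behavior concrete, here is a minimal round-trip sketch distilled from the test above. It is a sketch only: it assumes the public `parquet/encoding.h` factory helpers and Arrow's `ArrayFromJSON` testing utility, and spells out the `<parquet::BooleanType>` template arguments explicitly.

```cpp
#include "arrow/builder.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/encoding.h"

// RLE-encode a boolean array that contains nulls, then decode it back
// through DecodeArrow, which this patch makes work for null slots
// instead of raising ParquetException::NYI.
::arrow::Status RleBooleanNullRoundTrip() {
  auto array = ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])");

  auto encoder = parquet::MakeTypedEncoder<parquet::BooleanType>(
      parquet::Encoding::RLE, /*use_dictionary=*/false);
  encoder->Put(*array);
  auto buffer = encoder->FlushValues();

  auto decoder =
      parquet::MakeTypedDecoder<parquet::BooleanType>(parquet::Encoding::RLE);
  decoder->SetData(static_cast<int>(array->length()), buffer->data(),
                   static_cast<int>(buffer->size()));

  // Null slots are filled in from the validity bitmap; DecodeArrow's
  // return value counts only the decoded non-null values (2 here).
  ::arrow::BooleanBuilder builder;
  decoder->DecodeArrow(static_cast<int>(array->length()),
                       static_cast<int>(array->null_count()),
                       array->null_bitmap_data(), /*valid_bits_offset=*/0,
                       &builder);

  std::shared_ptr<::arrow::Array> result;
  ARROW_RETURN_NOT_OK(builder.Finish(&result));
  // result now equals the input array, nulls included.
  return ::arrow::Status::OK();
}
```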