diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build
index 474f0b1e9a51a..ff65a7328f171 100644
--- a/c_glib/example/vala/meson.build
+++ b/c_glib/example/vala/meson.build
@@ -18,11 +18,15 @@
 # under the License.
 
 if generate_vapi
+  c_flags = [
+    '-Wunused-but-set-variable',
+  ]
+  c_flags = meson.get_compiler('c').get_supported_arguments(c_flags)
   vala_example_executable_kwargs = {
     'c_args': [
       '-I' + project_build_root,
       '-I' + project_source_root,
-    ],
+    ] + c_flags,
     'dependencies': [
       arrow_glib_vapi,
       dependency('gio-2.0'),
diff --git a/c_glib/example/vala/read-file.vala b/c_glib/example/vala/read-file.vala
index a0a06275c4b24..287eddac76352 100644
--- a/c_glib/example/vala/read-file.vala
+++ b/c_glib/example/vala/read-file.vala
@@ -119,8 +119,8 @@ void print_array(GArrow.Array array) {
 
 void print_record_batch(GArrow.RecordBatch record_batch) {
   var n_columns = record_batch.get_n_columns();
-  for (var nth_column = 0; nth_column < n_columns; nth_column++) {
-    stdout.printf("columns[%" + int64.FORMAT + "](%s): ",
+  for (int nth_column = 0; nth_column < n_columns; nth_column++) {
+    stdout.printf("columns[%d](%s): ",
                   nth_column,
                   record_batch.get_column_name(nth_column));
     var array = record_batch.get_column_data(nth_column);
diff --git a/c_glib/example/vala/read-stream.vala b/c_glib/example/vala/read-stream.vala
index c58dc848930a8..4520c8609bdaf 100644
--- a/c_glib/example/vala/read-stream.vala
+++ b/c_glib/example/vala/read-stream.vala
@@ -119,8 +119,8 @@ void print_array(GArrow.Array array) {
 
 void print_record_batch(GArrow.RecordBatch record_batch) {
   var n_columns = record_batch.get_n_columns();
-  for (var nth_column = 0; nth_column < n_columns; nth_column++) {
-    stdout.printf("columns[%" + int64.FORMAT + "](%s): ",
+  for (int nth_column = 0; nth_column < n_columns; nth_column++) {
+    stdout.printf("columns[%d](%s): ",
                   nth_column,
                   record_batch.get_column_name(nth_column));
     var array = record_batch.get_column_data(nth_column);
diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt
index 6899f9c36a7f6..0a356d5722c42 100644
--- a/ci/conda_env_sphinx.txt
+++ b/ci/conda_env_sphinx.txt
@@ -29,5 +29,8 @@ sphinx-copybutton
 sphinxcontrib-jquery
 sphinx==6.2
 # Requirement for doctest-cython
-pytest-cython
+# Needs upper pin of 0.3.0, see:
+# https://github.com/lgpage/pytest-cython/issues/67
+# With 0.3.* bug fix release, the pin can be removed
+pytest-cython==0.2.2
 pandas
diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc
index 1f852e84d3d5c..9e32e5437325f 100644
--- a/cpp/src/arrow/config.cc
+++ b/cpp/src/arrow/config.cc
@@ -58,8 +58,6 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
     return "avx";
   } else if (query_flag(CpuInfo::SSE4_2)) {
     return "sse4_2";
-  } else if (query_flag(CpuInfo::ASIMD)) {
-    return "neon";
   } else {
     return "none";
   }
diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc
index 34731d19ab3eb..754386275d60c 100644
--- a/cpp/src/arrow/dataset/dataset_writer.cc
+++ b/cpp/src/arrow/dataset/dataset_writer.cc
@@ -515,7 +515,7 @@ class DatasetWriter::DatasetWriterImpl {
                     std::function<void()> finish_callback, uint64_t max_rows_queued)
       : scheduler_(scheduler),
         write_tasks_(util::MakeThrottledAsyncTaskGroup(
-            scheduler_, 1, /*queue=*/nullptr,
+            scheduler_, /*max_concurrent_cost=*/1, /*queue=*/nullptr,
             [finish_callback = std::move(finish_callback)] {
               finish_callback();
               return Status::OK();
@@ -541,6 +541,23 @@
     }
   }
 
+  void ResumeIfNeeded() {
+    if (!paused_) {
+      return;
+    }
+    bool needs_resume = false;
+    {
+      std::lock_guard<std::mutex> lg(mutex_);
+      if (!write_tasks_ || write_tasks_->QueueSize() == 0) {
+        needs_resume = true;
+      }
+    }
+    if (needs_resume) {
+      paused_ = false;
+      resume_callback_();
+    }
+  }
+
   void WriteRecordBatch(std::shared_ptr<RecordBatch> batch, const std::string& directory,
                         const std::string& prefix) {
     write_tasks_->AddSimpleTask(
@@ -549,11 +566,14 @@
               WriteAndCheckBackpressure(std::move(batch), directory, prefix);
          if (!has_room.is_finished()) {
            // We don't have to worry about sequencing backpressure here since
-            // task_group_ serves as our sequencer. If batches continue to arrive after
-            // we pause they will queue up in task_group_ until we free up and call
-            // Resume
+            // task_group_ serves as our sequencer. If batches continue to arrive
+            // after we pause they will queue up in task_group_ until we free up and
+            // call Resume
            pause_callback_();
-            return has_room.Then([this] { resume_callback_(); });
+            paused_ = true;
+            return has_room.Then([this] { ResumeIfNeeded(); });
+          } else {
+            ResumeIfNeeded();
          }
          return has_room;
        },
@@ -571,6 +591,9 @@
          return Future<>::MakeFinished();
        },
        "DatasetWriter::FinishAll"sv);
+    // Reset write_tasks_ to signal that we are done adding tasks, this will allow
+    // us to invoke the finish callback once the tasks wrap up.
+    std::lock_guard<std::mutex> lg(mutex_);
    write_tasks_.reset();
  }
 
@@ -660,7 +683,7 @@
   }
 
   util::AsyncTaskScheduler* scheduler_ = nullptr;
-  std::unique_ptr<util::AsyncTaskScheduler> write_tasks_;
+  std::unique_ptr<util::ThrottledAsyncTaskScheduler> write_tasks_;
   Future<> finish_fut_ = Future<>::Make();
   FileSystemDatasetWriteOptions write_options_;
   DatasetWriterState writer_state_;
@@ -670,6 +693,7 @@
   std::unordered_map<std::string, std::shared_ptr<DatasetWriterDirectoryQueue>>
       directory_queues_;
   std::mutex mutex_;
+  bool paused_ = false;
   Status err_;
 };
diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc
index 63e27bfbe5773..fbd45eadac2cd 100644
--- a/cpp/src/arrow/util/async_util.cc
+++ b/cpp/src/arrow/util/async_util.cc
@@ -118,6 +118,8 @@ class FifoQueue : public ThrottledAsyncTaskScheduler::Queue {
 
   void Purge() override { tasks_.clear(); }
 
+  std::size_t Size() const override { return tasks_.size(); }
+
  private:
   std::list<std::unique_ptr<Task>> tasks_;
 };
@@ -332,6 +334,10 @@ class ThrottledAsyncTaskSchedulerImpl
   void Pause() override { throttle_->Pause(); }
   void Resume() override { throttle_->Resume(); }
+  std::size_t QueueSize() override {
+    std::lock_guard<std::mutex> lk(mutex_);
+    return queue_->Size();
+  }
 
   const util::tracing::Span& span() const override { return target_->span(); }
 
  private:
@@ -499,6 +505,7 @@ class ThrottledAsyncTaskGroup : public ThrottledAsyncTaskScheduler {
       : throttle_(std::move(throttle)), task_group_(std::move(task_group)) {}
   void Pause() override { throttle_->Pause(); }
   void Resume() override { throttle_->Resume(); }
+  std::size_t QueueSize() override { return throttle_->QueueSize(); }
   const util::tracing::Span& span() const override { return task_group_->span(); }
   bool AddTask(std::unique_ptr<Task> task) override {
     return task_group_->AddTask(std::move(task));
diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h
index 7a675da59facd..d9ed63bdbce22 100644
--- a/cpp/src/arrow/util/async_util.h
+++ b/cpp/src/arrow/util/async_util.h
@@ -226,6 +226,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler {
     virtual bool Empty() = 0;
     /// Purge the queue of all items
     virtual void Purge() = 0;
+    virtual std::size_t Size() const = 0;
   };
 
   class Throttle {
@@ -277,6 +278,8 @@
   /// Allows task to be submitted again. If there is a max_concurrent_cost limit then
   /// it will still apply.
   virtual void Resume() = 0;
+  /// Return the number of tasks queued but not yet submitted
+  virtual std::size_t QueueSize() = 0;
 
   /// Create a throttled view of a scheduler
   ///
diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc
index 313ca91912335..1f9aad453e9c4 100644
--- a/cpp/src/arrow/util/async_util_test.cc
+++ b/cpp/src/arrow/util/async_util_test.cc
@@ -680,6 +680,7 @@ class PriorityQueue : public ThrottledAsyncTaskScheduler::Queue {
       queue_.pop();
     }
   }
+  std::size_t Size() const { return queue_.size(); }
 
  private:
  std::priority_queue<std::unique_ptr<TaskWithPriority>,
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index f16e9b34fc682..6e93b493392c9 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3143,27 +3143,58 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
                   typename EncodingTraits<BooleanType>::Accumulator* out) override {
-    if (null_count != 0) {
-      // TODO(ARROW-34660): implement DecodeArrow with null slots.
-      ParquetException::NYI("RleBoolean DecodeArrow with null slots");
+    if (null_count == num_values) {
+      PARQUET_THROW_NOT_OK(out->AppendNulls(null_count));
+      return 0;
     }
     constexpr int kBatchSize = 1024;
     std::array<bool, kBatchSize> values;
-    int sum_decode_count = 0;
-    do {
-      int current_batch = std::min(kBatchSize, num_values);
-      int decoded_count = decoder_->GetBatch(values.data(), current_batch);
-      if (decoded_count == 0) {
-        break;
+    const int num_non_null_values = num_values - null_count;
+    // Remaining non-null boolean values to read from decoder.
+    // We decode from `decoder_` with maximum 1024 size batches.
+    int num_remain_non_null_values = num_non_null_values;
+    int current_index_in_batch = 0;
+    int current_batch_size = 0;
+    auto next_boolean_batch = [&]() {
+      DCHECK_GT(num_remain_non_null_values, 0);
+      DCHECK_EQ(current_index_in_batch, current_batch_size);
+      current_batch_size = std::min(num_remain_non_null_values, kBatchSize);
+      int decoded_count = decoder_->GetBatch(values.data(), current_batch_size);
+      if (ARROW_PREDICT_FALSE(decoded_count != current_batch_size)) {
+        // required values is more than values in decoder.
+        ParquetException::EofException();
       }
-      sum_decode_count += decoded_count;
-      PARQUET_THROW_NOT_OK(out->Reserve(sum_decode_count));
-      for (int i = 0; i < decoded_count; ++i) {
-        PARQUET_THROW_NOT_OK(out->Append(values[i]));
+      num_remain_non_null_values -= current_batch_size;
+      current_index_in_batch = 0;
+    };
+
+    // Reserve all values including nulls first
+    PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+    if (null_count == 0) {
+      // Fast-path for not having nulls.
+      do {
+        next_boolean_batch();
+        PARQUET_THROW_NOT_OK(
+            out->AppendValues(values.begin(), values.begin() + current_batch_size));
+        num_values -= current_batch_size;
+        current_index_in_batch = 0;
+      } while (num_values > 0);
+      return num_non_null_values;
+    }
+    auto next_value = [&]() -> bool {
+      if (current_index_in_batch == current_batch_size) {
+        next_boolean_batch();
+        DCHECK_GT(current_batch_size, 0);
       }
-      num_values -= decoded_count;
-    } while (num_values > 0);
-    return sum_decode_count;
+      DCHECK_LT(current_index_in_batch, current_batch_size);
+      bool value = values[current_index_in_batch];
+      ++current_index_in_batch;
+      return value;
+    };
+    VisitNullBitmapInline(
+        valid_bits, valid_bits_offset, num_values, null_count,
+        [&]() { out->UnsafeAppend(next_value()); }, [&]() { out->UnsafeAppendNull(); });
+    return num_non_null_values;
   }
 
   int DecodeArrow(
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc
index 9c07d262b350e..a858c53e931d8 100644
--- a/cpp/src/parquet/encoding_benchmark.cc
+++ b/cpp/src/parquet/encoding_benchmark.cc
@@ -1518,11 +1518,10 @@ BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull)
 (benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); }
 BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull)
     ->Range(MIN_RANGE, MAX_RANGE);
-// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet.
-// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
-//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); }
-// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
-//    ->Apply(BooleanWithNullCustomArguments);
+BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
+(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); }
+BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
+    ->Apply(BooleanWithNullCustomArguments);
 
 BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow)
 (benchmark::State& state) { DecodeArrowDenseBenchmark(state); }
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index ea0029f4c7d7f..bb5126ce251d4 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -602,7 +602,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) {
 
 // Check that one can put several Arrow arrays into a given encoder
 // and decode to the right values (see GH-36939)
-TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) {
+TEST(BooleanArrayEncoding, AdHocRoundTrip) {
   std::vector<std::shared_ptr<::arrow::Array>> arrays{
       ::arrow::ArrayFromJSON(::arrow::boolean(), R"([])"),
       ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])"),
@@ -610,27 +610,29 @@
       ::arrow::ArrayFromJSON(::arrow::boolean(), R"([true, null, false])"),
   };
 
-  auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN,
-                                               /*use_dictionary=*/false);
-  for (const auto& array : arrays) {
-    encoder->Put(*array);
-  }
-  auto buffer = encoder->FlushValues();
-  auto decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
-  EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays));
-  decoder->SetData(static_cast<int>(expected->length()), buffer->data(),
-                   static_cast<int>(buffer->size()));
-
-  ::arrow::BooleanBuilder builder;
-  ASSERT_EQ(static_cast<int>(expected->length() - expected->null_count()),
-            decoder->DecodeArrow(static_cast<int>(expected->length()),
-                                 static_cast<int>(expected->null_count()),
-                                 expected->null_bitmap_data(), 0, &builder));
+  for (auto encoding : {Encoding::PLAIN, Encoding::RLE}) {
+    auto encoder = MakeTypedEncoder<BooleanType>(encoding,
+                                                 /*use_dictionary=*/false);
+    for (const auto& array : arrays) {
+      encoder->Put(*array);
+    }
+    auto buffer = encoder->FlushValues();
+    auto decoder = MakeTypedDecoder<BooleanType>(encoding);
+    EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays));
+    decoder->SetData(static_cast<int>(expected->length()), buffer->data(),
+                     static_cast<int>(buffer->size()));
+
+    ::arrow::BooleanBuilder builder;
+    ASSERT_EQ(static_cast<int>(expected->length() - expected->null_count()),
+              decoder->DecodeArrow(static_cast<int>(expected->length()),
+                                   static_cast<int>(expected->null_count()),
+                                   expected->null_bitmap_data(), 0, &builder));
 
-  std::shared_ptr<::arrow::Array> result;
-  ASSERT_OK(builder.Finish(&result));
-  ASSERT_EQ(expected->length(), result->length());
-  ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true);
+    std::shared_ptr<::arrow::Array> result;
+    ASSERT_OK(builder.Finish(&result));
+    ASSERT_EQ(expected->length(), result->length());
+    ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true);
+  }
 }
 
 template <typename T>
@@ -963,8 +965,6 @@ TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) {
 }
 
 TYPED_TEST(EncodingAdHocTyped, RleArrowDirectPut) {
-  // TODO: test with nulls once RleBooleanDecoder::DecodeArrow supports them
-  this->null_probability_ = 0;
   for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
     this->Rle(seed);
   }
diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java
index c23caf3bb5a03..27fd8e9798b67 100644
--- a/java/vector/src/main/codegen/templates/DenseUnionVector.java
+++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java
@@ -124,7 +124,7 @@ public class DenseUnionVector extends AbstractContainerVector implements FieldVe
       ArrowType.Struct.INSTANCE, /*dictionary*/ null, /*metadata*/ null);
 
   public static DenseUnionVector empty(String name, BufferAllocator allocator) {
-    FieldType fieldType = FieldType.nullable(new ArrowType.Union(
+    FieldType fieldType = FieldType.notNullable(new ArrowType.Union(
         UnionMode.Dense, null));
     return new DenseUnionVector(name, allocator, fieldType, null);
   }
@@ -908,6 +908,14 @@ private int getTypeBufferValueCapacity() {
     return (int) typeBuffer.capacity() / TYPE_WIDTH;
   }
 
+  public void setOffset(int index, int offset) {
+    while (index >= getOffsetBufferValueCapacity()) {
+      reallocOffsetBuffer();
+    }
+
+    offsetBuffer.setInt((long) index * OFFSET_WIDTH, offset);
+  }
+
   private long getOffsetBufferValueCapacity() {
     return offsetBuffer.capacity() / OFFSET_WIDTH;
   }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java
index 8fd33eb5a8432..2c29861561bb7 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java
@@ -99,6 +99,44 @@ public void testDenseUnionVector() throws Exception {
     }
   }
 
+  @Test
+  public void testSetOffset() {
+    try (DenseUnionVector duv = DenseUnionVector.empty("foo", allocator)) {
+      duv.allocateNew();
+      byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType()));
+      byte f64TypeId = duv.registerNewTypeId(Field.notNullable("f64", MinorType.FLOAT8.getType()));
+
+      IntVector i32Vector = ((IntVector) duv.addVector(i32TypeId, new IntVector("i32", allocator)));
+      Float8Vector f64Vector = ((Float8Vector) duv.addVector(f64TypeId, new Float8Vector("f64", allocator)));
+
+      i32Vector.allocateNew(3);
+      f64Vector.allocateNew(1);
+
+      duv.setTypeId(0, i32TypeId);
+      duv.setOffset(0, 0);
+      i32Vector.set(0, 42);
+
+      duv.setTypeId(1, i32TypeId);
+      duv.setOffset(1, 1);
+      i32Vector.set(1, 43);
+
+      duv.setTypeId(2, f64TypeId);
+      duv.setOffset(2, 0);
+      f64Vector.set(0, 3.14);
+
+      duv.setTypeId(3, i32TypeId);
+      duv.setOffset(3, 2);
+      i32Vector.set(2, 44);
+
+      duv.setValueCount(4);
+
+      assertEquals(42, duv.getObject(0));
+      assertEquals(43, duv.getObject(1));
+      assertEquals(3.14, duv.getObject(2));
+      assertEquals(44, duv.getObject(3));
+    }
+  }
+
   @Test
   public void testTransfer() throws Exception {
     try (DenseUnionVector srcVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) {
diff --git a/js/src/data.ts b/js/src/data.ts
index 6f8792508858b..45fcc35d37676 100644
--- a/js/src/data.ts
+++ b/js/src/data.ts
@@ -109,7 +109,10 @@ export class Data<T extends DataType = DataType> {
         let nullCount = this._nullCount;
         let nullBitmap: Uint8Array | undefined;
         if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) {
-            this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length);
+            this._nullCount = nullCount = nullBitmap.length === 0 ?
+                // no null bitmap, so all values are valid
+                0 :
+                this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length);
         }
         return nullCount;
     }
@@ -177,16 +180,16 @@
             // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s
             if (this.nullCount > 0) {
                 nullBitmap.set(truncateBitmap(offset, length, this.nullBitmap), 0);
+                Object.assign(this, { nullBitmap });
+            } else {
+                Object.assign(this, { nullBitmap, _nullCount: 0 });
             }
-            Object.assign(this, { nullBitmap, _nullCount: -1 });
         }
 
         const byte = nullBitmap[byteOffset];
         prev = (byte & mask) !== 0;
-        value ?
-            (nullBitmap[byteOffset] = byte | mask) :
-            (nullBitmap[byteOffset] = byte & ~mask);
+        nullBitmap[byteOffset] = value ? (byte | mask) : (byte & ~mask);
     }
 
     if (prev !== !!value) {
diff --git a/js/src/factories.ts b/js/src/factories.ts
index aa54498c50bc0..657ae1b95ab92 100644
--- a/js/src/factories.ts
+++ b/js/src/factories.ts
@@ -65,7 +65,7 @@ export function makeBuilder<T extends DataType = any, TNull = any>(options: Bui
 export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector<dtypes.Null>;
 export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector<dtypes.Bool>;
 export function vectorFromArray<T extends dtypes.Utf8 | dtypes.LargeUtf8 | dtypes.Dictionary<dtypes.Utf8 | dtypes.LargeUtf8, any> = dtypes.Dictionary<dtypes.Utf8, dtypes.Int32>>(values: readonly (null | undefined | string)[], type?: T): Vector<T>;
-export function vectorFromArray<T extends dtypes.Date_>(values: readonly (null | undefined | Date)[], type?: T): Vector<T>;
+export function vectorFromArray<T extends dtypes.TimestampMillisecond>(values: readonly (null | undefined | Date)[], type?: T): Vector<T>;
 export function vectorFromArray<T extends dtypes.Int>(values: readonly (null | undefined | number)[], type: T): Vector<T>;
 export function vectorFromArray<T extends dtypes.Int64 | dtypes.Uint64>(values: readonly (null | undefined | bigint)[], type?: T): Vector<T>;
 export function vectorFromArray<T extends dtypes.Float>(values: readonly (null | undefined | number)[], type?: T): Vector<T>;
@@ -145,7 +145,7 @@ function inferType(value: readonly unknown[]): dtypes.DataType {
     } else if (booleansCount + nullsCount === value.length) {
         return new dtypes.Bool;
     } else if (datesCount + nullsCount === value.length) {
-        return new dtypes.DateMillisecond;
+        return new dtypes.TimestampMillisecond;
     } else if (arraysCount + nullsCount === value.length) {
         const array = value as Array<unknown>[];
         const childType = inferType(array[array.findIndex((ary) => ary != null)]);
diff --git a/js/src/type.ts b/js/src/type.ts
index ae3aefa025999..a42552d65ad27 100644
--- a/js/src/type.ts
+++ b/js/src/type.ts
@@ -349,7 +349,19 @@ export class Date_<T extends Dates = Dates> extends DataType<T> {
 /** @ignore */
 export class DateDay extends Date_<Type.DateDay> { constructor() { super(DateUnit.DAY); } }
-/** @ignore */
+/**
+ * A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) in milliseconds.
+ * According to the specification, this should be treated as the number of days, in milliseconds, since the UNIX epoch.
+ * Therefore, values must be evenly divisible by `86_400_000` (the number of milliseconds in a standard day).
+ *
+ * Practically, validation that values of this type are evenly divisible by `86_400_000` is not enforced by this library
+ * for performance and usability reasons.
+ *
+ * Users should prefer to use {@link DateDay} to cleanly represent the number of days. For JS dates,
+ * {@link TimestampMillisecond} is the preferred type.
+ *
+ * @ignore
+ */
 export class DateMillisecond extends Date_<Type.DateMillisecond> { constructor() { super(DateUnit.MILLISECOND); } }
 /** @ignore */
diff --git a/js/src/vector.ts b/js/src/vector.ts
index a7c103bc326ee..1b0d9a05796f0 100644
--- a/js/src/vector.ts
+++ b/js/src/vector.ts
@@ -445,7 +445,7 @@ export function makeVector(init: any) {
     if (init instanceof DataView) {
         init = new Uint8Array(init.buffer);
     }
-    const props = { offset: 0, length: init.length, nullCount: 0, data: init };
+    const props = { offset: 0, length: init.length, nullCount: -1, data: init };
     if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); }
     if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); }
     if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); }
diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts
index f8b4c1c7976d2..e5cd49933eac5 100644
--- a/js/test/unit/vector/date-vector-tests.ts
+++ b/js/test/unit/vector/date-vector-tests.ts
@@ -15,10 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-import { DateDay, DateMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow';
+import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow';
+
+describe(`TimestampVector`, () => {
+    test(`Dates are stored in TimestampMillisecond`, () => {
+        const date = new Date('2023-02-01T12:34:56Z');
+        const vec = vectorFromArray([date]);
+        expect(vec.type).toBeInstanceOf(TimestampMillisecond);
+        expect(vec.get(0)).toBe(date.valueOf());
+    });
+});
 
 describe(`DateVector`, () => {
-    it('returns days since the epoch as correct JS Dates', () => {
+    test(`returns days since the epoch as correct JS Dates`, () => {
         const table = new Table(RecordBatchReader.from(test_data));
         const expectedMillis = expectedMillis32();
         const date32 = table.getChildAt<DateDay>(0)!;
@@ -28,7 +37,7 @@ describe(`DateVector`, () => {
         }
     });
 
-    it('returns millisecond longs since the epoch as correct JS Dates', () => {
+    test(`returns millisecond longs since the epoch as correct JS Dates`, () => {
         const table = new Table(RecordBatchReader.from(test_data));
         const expectedMillis = expectedMillis64();
         const date64 = table.getChildAt<DateMillisecond>(1)!;
@@ -38,9 +47,9 @@ describe(`DateVector`, () => {
         }
     });
 
-    it('returns the same date that was in the vector', () => {
+    test(`returns the same date that was in the vector`, () => {
         const dates = [new Date(1950, 1, 0)];
-        const vec = vectorFromArray(dates);
+        const vec = vectorFromArray(dates, new DateMillisecond());
         for (const date of vec) {
             expect(date).toEqual(dates.shift());
         }
diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts
index bfcf0d8547861..a10d7c757ca17 100644
--- a/js/test/unit/vector/vector-tests.ts
+++ b/js/test/unit/vector/vector-tests.ts
@@ -16,7 +16,7 @@
 // under the License.
 
 import {
-    Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray
+    Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData
 } from 'apache-arrow';
 
@@ -33,6 +33,47 @@ describe(`makeVectorFromArray`, () => {
     });
 });
 
+describe(`basic vector methods`, () => {
+    test(`not nullable`, () => {
+        const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: -1, type: new Int32() })]);
+        expect(vector.nullable).toBe(false);
+        expect(vector.nullCount).toBe(0);
+    });
+
+    test(`nullable`, () => {
+        const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: 0, type: new Int32() })]);
+        expect(vector.nullable).toBe(true);
+        expect(vector.nullCount).toBe(0);
+        expect(vector.isValid(0)).toBe(true);
+
+        // set a value to null
+        vector.set(0, null);
+        expect(vector.nullable).toBe(true);
+        expect(vector.nullCount).toBe(1);
+        expect(vector.isValid(0)).toBe(false);
+
+        // set the same value to null which should not change anything
+        vector.set(0, null);
+        expect(vector.nullable).toBe(true);
+        expect(vector.nullCount).toBe(1);
+
+        // set a different value to null
+        vector.set(1, null);
+        expect(vector.nullable).toBe(true);
+        expect(vector.nullCount).toBe(2);
+
+        // set first value to non-null
+        vector.set(0, 1);
+        expect(vector.nullable).toBe(true);
+        expect(vector.nullCount).toBe(1);
+
+        // set last null to non-null
+        vector.set(1, 2);
+        expect(vector.nullable).toBe(true);
+        expect(vector.nullCount).toBe(0);
+    });
+});
+
 describe(`StructVector`, () => {
     test(`makeVectorFromArray`, () => {
         const values: { a?: number; b?: string | null; c?: boolean | null }[] = [
@@ -108,7 +149,6 @@ describe(`DateVector`, () => {
 });
 
 describe(`DictionaryVector`, () => {
-
     const dictionary = ['foo', 'bar', 'baz'];
     const extras = ['abc', '123']; // values to search for that should NOT be found
     const dictionary_vec = vectorFromArray(dictionary, new Utf8).memoize();
@@ -117,7 +157,6 @@ describe(`DictionaryVector`, () => {
     const validity = Array.from({ length: indices.length }, () => Math.random() > 0.2);
 
     describe(`index with nullCount == 0`, () => {
-
         const values = indices.map((d) => dictionary[d]);
         const vector = makeVector({
             data: indices,
@@ -133,7 +172,6 @@ describe(`DictionaryVector`, () => {
     });
 
     describe(`index with nullCount > 0`, () => {
-
         const nullBitmap = util.packBools(validity);
         const nullCount = validity.reduce((acc, d) => acc + (d ? 0 : 1), 0);
         const values = indices.map((d, i) => validity[i] ? dictionary[d] : null);
diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R
index 33ab1ecc7aa4d..dd7b6dba7fde0 100644
--- a/r/tests/testthat/test-duckdb.R
+++ b/r/tests/testthat/test-duckdb.R
@@ -281,7 +281,7 @@ test_that("to_duckdb passing a connection", {
     to_duckdb(con = con_separate, auto_disconnect = FALSE)
 
   # Generates a query like SELECT * FROM arrow_xxx
-  table_four_query <- paste(show_query(table_four), collapse = "\n")
+  table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n")
   table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}")
   expect_false(is.na(table_four_name))