diff --git a/DESCRIPTION b/DESCRIPTION index bbe6c7d11c..9ec6e3c023 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tiledb Type: Package -Version: 0.21.1.4 +Version: 0.21.1.5 Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")), person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre")) diff --git a/NEWS.md b/NEWS.md index a0e0374f27..b5fc94f522 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ * Array fragment deletions uses a new static method (with TileDB 2.18.0 or later) (#599) +* The included `nanoarrow` header and source file have been updated to release 0.3.0 (#600) + ## Bug Fixes * The DESCRIPTION file now correctly refers to macOS 10.14 (#596) diff --git a/src/nanoarrow.c b/src/nanoarrow.c index 1d31884b19..717cbf5d87 100644 --- a/src/nanoarrow.c +++ b/src/nanoarrow.c @@ -49,12 +49,21 @@ int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { } } -const char* ArrowErrorMessage(struct ArrowError* error) { return error->message; } +const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 1; layout->element_size_bits[1] = 0; @@ -66,43 +75,53 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = 
NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 0; break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_LARGE_LIST: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; break; + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + case NANOARROW_TYPE_BOOL: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 1; break; case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 8; break; case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_HALF_FLOAT: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 16; break; case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; case NANOARROW_TYPE_INTERVAL_MONTHS: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; @@ -110,49 +129,61 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_INTERVAL_DAY_TIME: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 64; break; case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; 
layout->element_size_bits[1] = 128; break; case NANOARROW_TYPE_DECIMAL256: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 256; break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; break; case NANOARROW_TYPE_DENSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_SPARSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; break; case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; case NANOARROW_TYPE_LARGE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; break; default: @@ -1413,7 +1444,7 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi } } -// 
Helper for bookeeping to emulate sprintf()-like behaviour spread +// Helper for bookkeeping to emulate sprintf()-like behaviour spread // among multiple sprintf calls. static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { @@ -1794,7 +1825,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { // -- changed for tiledb-r static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, - enum ArrowType storage_type) { + enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: @@ -1894,24 +1925,43 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, return NANOARROW_OK; } -static ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, - struct ArrowArrayView* array_view, - struct ArrowError* error) { - ArrowArrayInitFromType(array, array_view->storage_type); +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; - int result = ArrowArrayAllocateChildren(array, array_view->n_children); - if (result != NANOARROW_OK) { - array->release(array); - return result; + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + } } - private_data->layout = array_view->layout; + if (array_view->dictionary != NULL) { + result = 
ArrowArrayAllocateDictionary(array); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } - for (int64_t i = 0; i < array_view->n_children; i++) { - int result = - ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); if (result != NANOARROW_OK) { array->release(array); return result; @@ -1957,9 +2007,7 @@ ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_ch return ENOMEM; } - for (int64_t i = 0; i < n_children; i++) { - array->children[i] = NULL; - } + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); for (int64_t i = 0; i < n_children; i++) { array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); @@ -2027,6 +2075,16 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v ArrowArrayViewInitFromType(array_view, private_data->storage_type); array_view->layout = private_data->layout; array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); if (result != NANOARROW_OK) { @@ -2042,6 +2100,20 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v } } + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != 
NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + return NANOARROW_OK; } @@ -2114,6 +2186,10 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); + } + return NANOARROW_OK; } @@ -2128,39 +2204,10 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { for (int64_t i = 0; i < array->n_children; i++) { ArrowArrayFlushInternalPointers(array->children[i]); } -} - -static ArrowErrorCode ArrowArrayCheckInternalBufferSizes( - struct ArrowArray* array, struct ArrowArrayView* array_view, char set_length, - struct ArrowError* error) { - if (set_length) { - ArrowArrayViewSetLength(array_view, array->offset + array->length); - } - - for (int64_t i = 0; i < array->n_buffers; i++) { - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - array->null_count == 0 && array->buffers[i] == NULL) { - continue; - } - - int64_t expected_size = array_view->buffer_views[i].size_bytes; - int64_t actual_size = ArrowArrayBuffer(array, i)->size_bytes; - - if (actual_size < expected_size) { - ArrowErrorSet( - error, - "Expected buffer %d to size >= %ld bytes but found buffer with %ld bytes", - (int)i, (long)expected_size, (long)actual_size); - return EINVAL; - } - } - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayCheckInternalBufferSizes( - array->children[i], array_view->children[i], set_length, error)); + if (array->dictionary != NULL) { + ArrowArrayFlushInternalPointers(array->dictionary); } - - return NANOARROW_OK; } ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, @@ -2170,7 +2217,7 @@ 
ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, // in some implementations (at least one version of Arrow C++ at the time this // was added). Only do this fix if we can assume CPU data access. if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { - NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array)); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); } // Make sure the value we get with array->buffers[i] is set to the actual @@ -2181,44 +2228,11 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, return NANOARROW_OK; } - // Check buffer sizes to make sure we are not sending an ArrowArray - // into the wild that is going to segfault + // For validation, initialize an ArrowArrayView with our known buffer sizes struct ArrowArrayView array_view; - - NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); - - // Check buffer sizes once without using internal buffer data since - // ArrowArrayViewSetArray() assumes that all the buffers are long enough - // and issues invalid reads on offset buffers if they are not - int result = ArrowArrayCheckInternalBufferSizes(array, &array_view, 1, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - if (validation_level == NANOARROW_VALIDATION_LEVEL_MINIMAL) { - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; - } - - result = ArrowArrayViewSetArray(&array_view, array, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - result = ArrowArrayCheckInternalBufferSizes(array, &array_view, 0, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - if (validation_level == NANOARROW_VALIDATION_LEVEL_DEFAULT) { - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; - } - - result = ArrowArrayViewValidateFull(&array_view, error); + 
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), + error); + int result = ArrowArrayViewValidate(&array_view, validation_level, error); ArrowArrayViewReset(&array_view); return result; } @@ -2265,6 +2279,21 @@ ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, return NANOARROW_OK; } +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { + if (array_view->dictionary != NULL) { + return EINVAL; + } + + array_view->dictionary = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->dictionary == NULL) { + return ENOMEM; + } + + ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); + return NANOARROW_OK; +} + ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, struct ArrowSchema* schema, struct ArrowError* error) { @@ -2279,6 +2308,7 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); if (result != NANOARROW_OK) { + ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); ArrowArrayViewReset(array_view); return result; } @@ -2292,6 +2322,21 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, } } + if (schema->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = + ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); @@ -2323,6 +2368,11 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { 
ArrowFree(array_view->children); } + if (array_view->dictionary != NULL) { + ArrowArrayViewReset(array_view->dictionary); + ArrowFree(array_view->dictionary); + } + if (array_view->union_type_id_map != NULL) { ArrowFree(array_view->union_type_id_map); } @@ -2333,7 +2383,6 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { for (int i = 0; i < 3; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; - array_view->buffer_views[i].data.data = NULL; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: @@ -2377,11 +2426,11 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) } } -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - struct ArrowArray* array, - struct ArrowError* error) { - array_view->array = array; - +// This version recursively extracts information from the array and stores it +// in the array view, performing any checks that require the original array. 
+static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error) { // Check length and offset if (array->offset < 0) { ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", @@ -2395,8 +2444,10 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - // First pass setting lengths that do not depend on the data buffer - ArrowArrayViewSetLength(array_view, array->offset + array->length); + array_view->array = array; + array_view->offset = array->offset; + array_view->length = array->length; + array_view->null_count = array->null_count; int64_t buffers_required = 0; for (int i = 0; i < 3; i++) { @@ -2406,28 +2457,187 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, buffers_required++; - // If the null_count is 0, the validity buffer can be NULL - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - array->null_count == 0 && array->buffers[i] == NULL) { + // Set buffer pointer + array_view->buffer_views[i].data.data = array->buffers[i]; + + // If non-null, set buffer size to unknown. 
+ if (array->buffers[i] == NULL) { array_view->buffer_views[i].size_bytes = 0; + } else { + array_view->buffer_views[i].size_bytes = -1; } - - array_view->buffer_views[i].data.data = array->buffers[i]; } + // Check the number of buffers if (buffers_required != array->n_buffers) { ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", (int)buffers_required, (int)array->n_buffers); return EINVAL; } + // Check number of children if (array_view->n_children != array->n_children) { ArrowErrorSet(error, "Expected %ld children but found %ld children", (long)array_view->n_children, (long)array->n_children); return EINVAL; } - // Check child sizes and calculate sizes that depend on data in the array buffers + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], + array->children[i], error)); + } + + // Check dictionary + if (array->dictionary == NULL && array_view->dictionary != NULL) { + ArrowErrorSet(error, "Expected dictionary but found NULL"); + return EINVAL; + } + + if (array->dictionary != NULL && array_view->dictionary == NULL) { + ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); + return EINVAL; + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Calculate buffer sizes that do not require buffer access. If marked as + // unknown, assign the buffer size; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + // Only loop over the first two buffers because the size of the third buffer + // is always data dependent for all current Arrow types. 
+ for (int i = 0; i < 2; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + // Initialize with a value that will cause an error if accidentally used uninitialized + int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { + continue; + } + + min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); + break; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + min_buffer_size_bytes = + (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); + break; + case NANOARROW_BUFFER_TYPE_DATA: + min_buffer_size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * + offset_plus_length) / + 8; + break; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + min_buffer_size_bytes = element_size_bytes * offset_plus_length; + break; + case NANOARROW_BUFFER_TYPE_NONE: + continue; + } + + // Assign or validate buffer size + if (array_view->buffer_views[i].size_bytes == -1) { + array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; + } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { + ArrowErrorSet(error, + "Expected %s array buffer %d to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (int)i, + (long)min_buffer_size_bytes, + (long)array_view->buffer_views[i].size_bytes); + return EINVAL; + } + } + + // For list, fixed-size list and map views, we can validate the number of children + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->n_children != 
1) { + ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", + ArrowTypeString(array_view->storage_type), + (long)array_view->n_children); + return EINVAL; + } + default: + break; + } + + // For struct, the sparse union, and the fixed-size list views, we can validate child + // lengths. + int64_t child_min_length; + switch (array_view->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + child_min_length = (array_view->offset + array_view->length); + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < child_min_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)(child_min_length), + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_min_length = (array_view->offset + array_view->length) * + array_view->layout.child_size_elements; + if (array_view->children[0]->length < child_min_length) { + ArrowErrorSet(error, + "Expected child of fixed_size_list array to have length >= %ld but " + "found array with length %ld", + (long)child_min_length, (long)array_view->children[0]->length); + return EINVAL; + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateMinimal(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Perform minimal validation. This will validate or assign + // buffer sizes as long as buffer access is not required. 
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + // Calculate buffer sizes or child lengths that require accessing the offsets + // buffer. Where appropriate, validate that the first offset is >= 0. + // If a buffer size is marked as unknown, assign it; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + int64_t first_offset; int64_t last_offset; switch (array_view->storage_type) { @@ -2441,11 +2651,22 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int32[array->offset + array->length]; - array_view->buffer_views[2].size_bytes = last_offset; + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } } break; + case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { @@ -2456,34 +2677,38 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int64[array->offset + array->length]; - array_view->buffer_views[2].size_bytes = last_offset; + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if 
(array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } } break; + case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array_view->n_children; i++) { - if (array->children[i]->length < (array->offset + array->length)) { + if (array_view->children[i]->length < offset_plus_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", - (int)(i + 1), (long)(array->offset + array->length), - (long)array->children[i]->length); + (int)(i + 1), (long)offset_plus_length, + (long)array_view->children[i]->length); return EINVAL; } } break; - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: { - const char* type_name = - array_view->storage_type == NANOARROW_TYPE_LIST ? "list" : "map"; - if (array->n_children != 1) { - ArrowErrorSet(error, "Expected 1 child of %s array but found %d child arrays", - type_name, (int)array->n_children); - return EINVAL; - } + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { @@ -2492,27 +2717,20 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int32[array->offset + array->length]; - if (array->children[0]->length < last_offset) { + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, - "Expected child of %s array with length >= %ld but found array with " + "Expected child of %s array to have length >= %ld but found array with " "length %ld", - type_name, (long)last_offset, (long)array->children[0]->length); + 
ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->children[0]->length); return EINVAL; } } break; - } - case NANOARROW_TYPE_LARGE_LIST: - if (array->n_children != 1) { - ArrowErrorSet(error, - "Expected 1 child of large list array but found %d child arrays", - (int)array->n_children); - return EINVAL; - } + case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { @@ -2521,46 +2739,58 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int64[array->offset + array->length]; - if (array->children[0]->length < last_offset) { + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, - "Expected child of large list array with length >= %ld but found array " + "Expected child of large list array to have length >= %ld but found array " "with length %ld", - (long)last_offset, (long)array->children[0]->length); + (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - if (array->n_children != 1) { - ArrowErrorSet(error, - "Expected 1 child of fixed-size array but found %d child arrays", - (int)array->n_children); - return EINVAL; - } - - last_offset = - (array->offset + array->length) * array_view->layout.child_size_elements; - if (array->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of fixed-size list array with length >= %ld but found array " - "with length %ld", - (long)last_offset, (long)array->children[0]->length); - return EINVAL; - } - break; default: break; } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( - ArrowArrayViewSetArray(array_view->children[i], array->children[i], 
error)); + ArrowArrayViewValidateDefault(array_view->children[i], error)); } + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run minimal validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. 
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + return NANOARROW_OK; } @@ -2571,10 +2801,8 @@ static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { - int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1]; - if (diff < 0) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", - (long)i, (long)diff); + if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } @@ -2589,10 +2817,8 @@ static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { - int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1]; - if (diff < 0) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", - (long)i, (long)diff); + if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } @@ -2635,8 +2861,8 @@ static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, return NANOARROW_OK; } -ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error) { +static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, + struct ArrowError* error) { for (int i = 0; i < 3; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: @@ -2655,17 +2881,18 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { - // Check that we have valid type ids. 
if (array_view->union_type_id_map == NULL) { - // If the union_type_id map is NULL - // (e.g., when using ArrowArrayInitFromType() + ArrowArrayAllocateChildren() - // + ArrowArrayFinishBuilding()), we don't have enough information to validate - // this buffer (GH-178). + // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough + // information to validate this buffer. + ArrowErrorSet(error, + "Insufficient information provided for validation of union array"); + return EINVAL; } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( array_view->union_type_id_map, array_view->n_children, array_view->n_children)) { - NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(array_view->buffer_views[0], 0, - array_view->n_children - 1, error)); + NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( + array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); } else { NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], array_view->union_type_id_map + 128, @@ -2676,10 +2903,10 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && array_view->union_type_id_map != NULL) { // Check that offsets refer to child elements that actually exist - for (int64_t i = 0; i < array_view->array->length; i++) { + for (int64_t i = 0; i < array_view->length; i++) { int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); - int64_t child_length = array_view->array->children[child_id]->length; + int64_t child_length = array_view->children[child_id]->length; if (offset < 0 || offset > child_length) { ArrowErrorSet( error, @@ -2691,12 +2918,38 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, } } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { 
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); } + // Dictionary validation not implemented + if (array_view->dictionary != NULL) { + ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented"); + return ENOTSUP; + } + return NANOARROW_OK; } + +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + switch (validation_level) { + case NANOARROW_VALIDATION_LEVEL_NONE: + return NANOARROW_OK; + case NANOARROW_VALIDATION_LEVEL_MINIMAL: + return ArrowArrayViewValidateMinimal(array_view, error); + case NANOARROW_VALIDATION_LEVEL_DEFAULT: + return ArrowArrayViewValidateDefault(array_view, error); + case NANOARROW_VALIDATION_LEVEL_FULL: + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + return ArrowArrayViewValidateFull(array_view, error); + } + + ArrowErrorSet(error, "validation_level not recognized"); + return EINVAL; +} // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements.
See the NOTICE file // distributed with this work for additional information @@ -2787,8 +3040,9 @@ static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, struct ArrowSchema* schema, int64_t n_arrays) { - struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)ArrowMalloc( - sizeof(struct BasicArrayStreamPrivate)); + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)ArrowMalloc( + sizeof(struct BasicArrayStreamPrivate)); if (private_data == NULL) { return ENOMEM; } diff --git a/src/nanoarrow.h b/src/nanoarrow.h index 90ce2dc06a..e8ecdd344f 100644 --- a/src/nanoarrow.h +++ b/src/nanoarrow.h @@ -19,15 +19,15 @@ #define NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 2 +#define NANOARROW_VERSION_MINOR 3 #define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.2.0-SNAPSHOT" +#define NANOARROW_VERSION "0.3.0" #define NANOARROW_VERSION_INT \ (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ NANOARROW_VERSION_PATCH) -// #define NANOARROW_NAMESPACE YourNamespaceHere +#define NANOARROW_NAMESPACE RPkg #endif // Licensed to the Apache Software Foundation (ASF) under one @@ -55,6 +55,11 @@ +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -191,6 +196,30 @@ static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, #define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? 
NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ + NAME, __FILE__, __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + /// \brief Return code for success. /// \ingroup nanoarrow-errors #define NANOARROW_OK 0 @@ -204,6 +233,47 @@ typedef int ArrowErrorCode; #define NANOARROW_RETURN_NOT_OK(EXPR) \ _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when calling +/// a nanoarrow function that does *not* accept ArrowError). 
+#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf(stderr, "%s failed with errno %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ + __FILE__, (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), +/// print a message to stderr and abort. If nanoarrow was built in release mode, +/// this statement has no effect. You can customize fatal error behaviour +/// by defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h. +/// This macro is provided as a convenience for users and is not used internally.
+#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) EXPR +#endif + static char _ArrowIsLittleEndian(void) { uint32_t check = 1; char first_byte; @@ -263,6 +333,8 @@ enum ArrowType { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + static inline const char* ArrowTypeString(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_NA: @@ -381,6 +453,8 @@ enum ArrowValidationLevel { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: @@ -423,6 +497,8 @@ struct ArrowStringView { /// \brief Return a view of a const C string /// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + static inline struct ArrowStringView ArrowCharView(const char* value) { struct ArrowStringView out; @@ -436,26 +512,28 @@ static inline struct ArrowStringView ArrowCharView(const char* value) { return out; } +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; +}; + /// \brief An non-owning view of a buffer /// \ingroup nanoarrow-utils struct ArrowBufferView { /// \brief A pointer to the start of the buffer /// /// If size_bytes is 0, this value may be NULL. 
- union { - const void* data; - const int8_t* as_int8; - const uint8_t* as_uint8; - const int16_t* as_int16; - const uint16_t* as_uint16; - const int32_t* as_int32; - const uint32_t* as_uint32; - const int64_t* as_int64; - const uint64_t* as_uint64; - const double* as_double; - const float* as_float; - const char* as_char; - } data; + union ArrowBufferViewData data; /// \brief The size of the buffer in bytes int64_t size_bytes; @@ -517,6 +595,9 @@ struct ArrowLayout { /// \brief The function of each buffer enum ArrowBufferType buffer_type[3]; + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[3]; + /// \brief The size of an element each buffer or 0 if this size is variable or unknown int64_t element_size_bits[3]; @@ -531,12 +612,23 @@ struct ArrowLayout { /// This data structure provides access to the values contained within /// an ArrowArray with fields provided in a more readily-extractible /// form. You can re-use an ArrowArrayView for multiple ArrowArrays -/// with the same storage type, or use it to represent a hypothetical -/// ArrowArray that does not exist yet. +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. struct ArrowArrayView { - /// \brief The underlying ArrowArray or NULL if it has not been set + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. struct ArrowArray* array; + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. 
+ int64_t null_count; + /// \brief The type used to store values in this array /// /// This type represents only the minimum required information to @@ -557,6 +649,9 @@ struct ArrowArrayView { /// \brief Pointers to views of this array's children struct ArrowArrayView** children; + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + /// \brief Union type id to child index mapping /// /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer @@ -593,6 +688,29 @@ struct ArrowArrayPrivateData { int8_t union_type_id_is_child_index; }; +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit(struct ArrowInterval* interval, + enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + /// \brief A representation of a fixed-precision decimal number /// \ingroup nanoarrow-utils /// @@ -776,6 +894,10 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) #define ArrowArrayInitFromSchema \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) #define ArrowArrayAllocateChildren \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) 
#define ArrowArrayAllocateDictionary \ @@ -794,12 +916,16 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) #define ArrowArrayViewAllocateChildren \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) +#define ArrowArrayViewAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) #define ArrowArrayViewSetLength \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) #define ArrowArrayViewSetArray \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) -#define ArrowArrayViewValidateFull \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidateFull) +#define ArrowArrayViewSetArrayMinimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) +#define ArrowArrayViewValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) #define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) #define ArrowBasicArrayStreamInit \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) @@ -866,7 +992,16 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( /// need to communicate more verbose error information accept a pointer /// to an ArrowError. This can be stack or statically allocated. The /// content of the message is undefined unless an error code has been -/// returned. +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the +/// ArrowError pointed to by the argument will be propagated with a +/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere +/// in the nanoarrow API. +/// +/// Except where documented, it is generally not safe to continue after a +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and +/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. 
C++ clients can use +/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms +/// for memory management and error propagation. /// /// @{ @@ -876,10 +1011,24 @@ struct ArrowError { char message[1024]; }; -/// \brief Set the contents of an error using printf syntax +/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorInit(struct ArrowError* error) { + if (error) { + error->message[0] = '\0'; + } +} + +/// \brief Set the contents of an error using printf syntax. +/// +/// If error is NULL, this function does nothing and returns NANOARROW_OK. ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...); /// \brief Get the contents of an error +/// +/// If error is NULL, returns "", or returns the contents of the error message +/// otherwise. const char* ArrowErrorMessage(struct ArrowError* error); /// @} @@ -1103,7 +1252,7 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, /// Contains more readily extractable values than a raw ArrowSchema. /// Clients can stack or statically allocate this structure but are /// encouraged to use the provided getters to ensure forward -/// compatiblity. +/// compatibility.
struct ArrowSchemaView { /// \brief A pointer to the schema represented by this view struct ArrowSchema* schema; @@ -1333,6 +1482,14 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l /// \brief Count true values in a bitmap static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out); + +/// \brief Extract int32 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out); + /// \brief Initialize an ArrowBitmap /// /// Initialize the builder's buffer, empty its cache, and reset the size to zero @@ -1413,6 +1570,14 @@ ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, struct ArrowSchema* schema, struct ArrowError* error); +/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + struct ArrowArrayView* array_view, + struct ArrowError* error); + /// \brief Allocate the array->children array /// /// Includes the memory for each child struct ArrowArray, @@ -1500,21 +1665,31 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, /// \brief Append a string of bytes to an array /// /// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., -/// the underlying array is not a binary, string, large binary, large string, -/// or fixed-size binary array, or value is the wrong size for a fixed-size -/// binary array). 
+/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// binary, string, large binary, large string, or fixed-size binary array, or value is +/// the wrong size for a fixed-size binary array). static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, struct ArrowBufferView value); /// \brief Append a string value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., -/// the underlying array is not a string or large string array). +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// string or large string array). static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, struct ArrowStringView value); +/// \brief Append an Interval to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + struct ArrowInterval* value); + /// \brief Append a decimal value to an array /// /// Returns NANOARROW_OK if array is a decimal array with the appropriate @@ -1525,7 +1700,8 @@ static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, /// \brief Finish a nested array element /// /// Appends a non-null element to the array based on the first child's current -/// length. Returns NANOARROW_OK if the item was successfully added or EINVAL +/// length.
Returns NANOARROW_OK if the item was successfully added, EOVERFLOW +/// if the child of a list or map array would exceed INT_MAX elements, or EINVAL /// if the underlying storage type is not a struct, list, large list, or fixed-size /// list, or if there was an attempt to add a struct or fixed-size list element where the /// length of the child array(s) did not match the expected length. @@ -1561,7 +1737,7 @@ ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, /// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU /// buffer data access is not possible or more validation (i.e., /// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or -/// corruptable source. +/// corruptible source. ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, enum ArrowValidationLevel validation_level, struct ArrowError* error); @@ -1570,7 +1746,7 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, /// \defgroup nanoarrow-array-view Reading arrays /// -/// These functions read and validate the contents ArrowArray structures +/// These functions read and validate the contents ArrowArray structures. 
/// /// @{ @@ -1590,12 +1766,15 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, struct ArrowSchema* schema, struct ArrowError* error); -/// \brief Allocate the schema_view->children array +/// \brief Allocate the array_view->children array /// /// Includes the memory for each child struct ArrowArrayView ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, int64_t n_children); +/// \brief Allocate array_view->dictionary +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); + /// \brief Set data-independent buffer sizes from length void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); @@ -1603,9 +1782,23 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, struct ArrowArray* array, struct ArrowError* error); -/// \brief Performs extra checks on the array that was set via ArrowArrayViewSetArray() -ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error); +/// \brief Set buffer sizes and data pointers from an ArrowArray except for those +/// that require dereferencing buffer content. +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Performs checks on the content of an ArrowArrayView +/// +/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, +/// the buffer sizes and some content (first and last offset) have already +/// been validated at the "default" level. If setting the buffer pointers +/// and sizes otherwise, you may wish to perform checks at a different level. See +/// documentation for ArrowValidationLevel for the details of checks performed +/// at each level.
+ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); /// \brief Reset the contents of an ArrowArrayView and frees resources void ArrowArrayViewReset(struct ArrowArrayView* array_view); @@ -1938,6 +2131,28 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) { return (bits >> 3) + ((bits & 7) != 0); } +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word >> 0) & 1; + out[1] = (word >> 1) & 1; + out[2] = (word >> 2) & 1; + out[3] = (word >> 3) & 1; + out[4] = (word >> 4) & 1; + out[5] = (word >> 5) & 1; + out[6] = (word >> 6) & 1; + out[7] = (word >> 7) & 1; +} + +static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + out[0] = (word >> 0) & 1; + out[1] = (word >> 1) & 1; + out[2] = (word >> 2) & 1; + out[3] = (word >> 3) & 1; + out[4] = (word >> 4) & 1; + out[5] = (word >> 5) & 1; + out[6] = (word >> 6) & 1; + out[7] = (word >> 7) & 1; +} + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | values[5] << 5 | values[6] << 6 | values[7] << 7); @@ -1952,6 +2167,84 @@ static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { return (bits[i >> 3] >> (i & 0x07)) & 1; } +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = 
ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = i_end % 8 == 0 ? 8 : i_end % 8; + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt32(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = i_end % 8 == 0 ? 
8 : i_end % 8; + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + static inline void ArrowBitSet(uint8_t* bits, int64_t i) { bits[i / 8] |= _ArrowkBitmask[i % 8]; } @@ -2012,36 +2305,37 @@ static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_end = i_end / 8 + 1; + const int64_t bytes_last_valid = i_last_valid / 8; - if (bytes_end == bytes_begin + 1) { + if (bytes_begin == bytes_last_valid) { // count bits within a single byte const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; return _ArrowkBytePopcount[byte_masked]; } const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? 
0 : _ArrowkTrailingBitmask[i_end % 8]; int64_t count = 0; // first byte count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; // middle bytes - for (int64_t i = bytes_begin + 1; i < (bytes_end - 1); i++) { + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { count += _ArrowkBytePopcount[bits[i]]; } // last byte - count += _ArrowkBytePopcount[bits[bytes_end - 1] & ~last_byte_mask]; + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; return count; } @@ -2286,7 +2580,7 @@ static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) } if (out != NULL) { - out[i] = type_id; + out[i] = (int8_t)type_id; } i++; @@ -2360,11 +2654,15 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } } - // Start building any child arrays + // Start building any child arrays or dictionaries for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + return NANOARROW_OK; } @@ -2378,6 +2676,10 @@ static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + return NANOARROW_OK; } @@ -2559,10 +2861,10 @@ static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); return ArrowArrayAppendUInt(array, value); case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); break; case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; 
case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); @@ -2591,28 +2893,28 @@ static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); break; case NANOARROW_TYPE_UINT32: - _NANOARROW_CHECK_RANGE(value, 0, UINT32_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); break; case NANOARROW_TYPE_UINT16: - _NANOARROW_CHECK_RANGE(value, 0, UINT16_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); break; case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_RANGE(value, 0, UINT8_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); break; case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); return ArrowArrayAppendInt(array, value); case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); break; case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); @@ -2671,11 +2973,11 @@ static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: offset = ((int32_t*)offset_buffer->data)[array->length]; - if ((offset + value.size_bytes) > INT32_MAX) { - return EINVAL; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return 
EOVERFLOW; } - offset += value.size_bytes; + offset += (int32_t)value.size_bytes; NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); @@ -2723,12 +3025,57 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, switch (private_data->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: return ArrowArrayAppendBytes(array, buffer_view); default: return EINVAL; } } +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + array->length++; + return NANOARROW_OK; +} + static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, struct ArrowDecimal* value) { 
struct ArrowArrayPrivateData* private_data = @@ -2775,7 +3122,7 @@ static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { case NANOARROW_TYPE_MAP: child_length = array->children[0]->length; if (child_length > INT32_MAX) { - return EINVAL; + return EOVERFLOW; } NANOARROW_RETURN_NOT_OK( ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); @@ -2824,7 +3171,7 @@ static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* arr switch (private_data->storage_type) { case NANOARROW_TYPE_DENSE_UNION: - // Apppend the target child length to the union offsets buffer + // Append the target child length to the union offsets buffer _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); @@ -2864,7 +3211,7 @@ static inline void ArrowArrayViewMove(struct ArrowArrayView* src, static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i) { const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; - i += array_view->array->offset; + i += array_view->offset; switch (array_view->storage_type) { case NANOARROW_TYPE_NA: return 0x01; @@ -2910,10 +3257,22 @@ static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* arra } } +static inline int64_t ArrowArrayViewListChildOffset(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, int64_t i) { struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - i += array_view->array->offset; + i += array_view->offset; switch (array_view->storage_type) { 
case NANOARROW_TYPE_INT64: return data_view->data.as_int64[i]; @@ -2932,9 +3291,9 @@ static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_vi case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; + return (int64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; + return (int64_t)data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -2944,7 +3303,7 @@ static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_vi static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; + i += array_view->offset; struct ArrowBufferView* data_view = &array_view->buffer_views[1]; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: @@ -2964,9 +3323,9 @@ static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_ case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; + return (uint64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; + return (uint64_t)data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -2976,13 +3335,13 @@ static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_ static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; + i += array_view->offset; struct ArrowBufferView* data_view = &array_view->buffer_views[1]; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; + return (double)data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; + return 
(double)data_view->data.as_uint64[i]; case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: @@ -3008,7 +3367,7 @@ static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_ static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; + i += array_view->offset; struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; const char* data_view = array_view->buffer_views[2].data.as_char; @@ -3041,7 +3400,7 @@ static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; + i += array_view->offset; struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; @@ -3073,9 +3432,36 @@ static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( return view; } +static inline void ArrowArrayViewGetIntervalUnsafe(struct ArrowArrayView* array_view, + int64_t i, struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, 
sizeof(int64_t)); + break; + } + default: + break; + } +} + static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, int64_t i, struct ArrowDecimal* out) { - i += array_view->array->offset; + i += array_view->offset; const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; switch (array_view->storage_type) { case NANOARROW_TYPE_DECIMAL128: