From 78dc855cb02bc73666b6759629298a60a0493682 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 17 Jul 2024 13:33:54 -0300 Subject: [PATCH] document some internals --- python/src/nanoarrow/_array.pyx | 435 +++++++++++++++++--------------- 1 file changed, 227 insertions(+), 208 deletions(-) diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx index e4e028abd..c4236b5ed 100644 --- a/python/src/nanoarrow/_array.pyx +++ b/python/src/nanoarrow/_array.pyx @@ -87,10 +87,187 @@ from nanoarrow._utils cimport ( Error ) +from typing import Iterable, Tuple, Union + from nanoarrow import _repr_utils from nanoarrow._device import DEVICE_CPU, DeviceType +cdef class CArrayView: + """Low-level ArrowArrayView wrapper + + This object is a literal wrapper around an ArrowArrayView. It provides field accessors + that return Python objects and handles the structure lifecycle (i.e., initialized + ArrowArrayView structures are always released). + + See `nanoarrow.c_array_view()` for construction and usage examples. 
+ """ + + def __cinit__(self, object base, uintptr_t addr): + self._base = base + self._ptr = addr + self._device = DEVICE_CPU + + def _set_array(self, CArray array, Device device=DEVICE_CPU): + cdef Error error = Error() + cdef int code + + if device is DEVICE_CPU: + code = ArrowArrayViewSetArray(self._ptr, array._ptr, &error.c_error) + else: + code = ArrowArrayViewSetArrayMinimal(self._ptr, array._ptr, &error.c_error) + + error.raise_message_not_ok("ArrowArrayViewSetArray()", code) + self._array_base = array._base + self._device = device + return self + + @property + def storage_type_id(self): + return self._ptr.storage_type + + @property + def storage_type(self): + cdef const char* type_str = ArrowTypeString(self._ptr.storage_type) + if type_str != NULL: + return type_str.decode('UTF-8') + + @property + def layout(self): + return CLayout(self, &self._ptr.layout) + + def __len__(self): + return self._ptr.length + + @property + def length(self): + return len(self) + + @property + def offset(self): + return self._ptr.offset + + @property + def null_count(self): + if self._ptr.null_count != -1: + return self._ptr.null_count + + cdef ArrowBufferType buffer_type = self._ptr.layout.buffer_type[0] + cdef const uint8_t* validity_bits = self._ptr.buffer_views[0].data.as_uint8 + + if buffer_type != NANOARROW_BUFFER_TYPE_VALIDITY: + self._ptr.null_count = 0 + elif validity_bits == NULL: + self._ptr.null_count = 0 + elif self._device is DEVICE_CPU: + self._ptr.null_count = ( + self._ptr.length - + ArrowBitCountSet(validity_bits, self.offset, self.length) + ) + + return self._ptr.null_count + + @property + def n_children(self): + return self._ptr.n_children + + def child(self, int64_t i): + if i < 0 or i >= self._ptr.n_children: + raise IndexError(f"{i} out of range [0, {self._ptr.n_children})") + + cdef CArrayView child = CArrayView( + self._base, + self._ptr.children[i] + ) + + child._device = self._device + return child + + @property + def children(self): + for i in 
range(self.n_children): + yield self.child(i) + + @property + def n_buffers(self): + return self.layout.n_buffers + + def buffer_type(self, int64_t i): + if i < 0 or i >= self.n_buffers: + raise IndexError(f"{i} out of range [0, {self.n_buffers}]") + + buffer_type = self._ptr.layout.buffer_type[i] + if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY: + return "validity" + elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID: + return "type_id" + elif buffer_type == NANOARROW_BUFFER_TYPE_UNION_OFFSET: + return "union_offset" + elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_OFFSET: + return "data_offset" + elif buffer_type == NANOARROW_BUFFER_TYPE_DATA: + return "data" + else: + return "none" + + def buffer(self, int64_t i): + if i < 0 or i >= self.n_buffers: + raise IndexError(f"{i} out of range [0, {self.n_buffers}]") + + cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i]) + + # Check the buffer size here because the error later is cryptic. + # Buffer sizes are set to -1 when they are "unknown", so because of errors + # in nanoarrow/C or because the array is on a non-CPU device, that -1 value + # could leak its way here. 
+ if buffer_view.size_bytes < 0: + raise RuntimeError(f"ArrowArrayView buffer {i} has size_bytes < 0") + + return CBufferView( + self._array_base, + buffer_view.data.data, + buffer_view.size_bytes, + self._ptr.layout.buffer_data_type[i], + self._ptr.layout.element_size_bits[i], + self._device + ) + + @property + def buffers(self): + for i in range(self.n_buffers): + yield self.buffer(i) + + @property + def dictionary(self): + if self._ptr.dictionary == NULL: + return None + else: + return CArrayView( + self, + self._ptr.dictionary + ) + + def __repr__(self): + return _repr_utils.array_view_repr(self) + + @staticmethod + def from_schema(CSchema schema): + cdef ArrowArrayView* c_array_view + base = alloc_c_array_view(&c_array_view) + + cdef Error error = Error() + cdef int code = ArrowArrayViewInitFromSchema(c_array_view, + schema._ptr, &error.c_error) + error.raise_message_not_ok("ArrowArrayViewInitFromSchema()", code) + + return CArrayView(base, c_array_view) + + @staticmethod + def from_array(CArray array, Device device=DEVICE_CPU): + out = CArrayView.from_schema(array._schema) + return out._set_array(array, device) + + cdef class CArray: """Low-level ArrowArray wrapper @@ -102,7 +279,8 @@ cdef class CArray: """ @staticmethod - def allocate(CSchema schema): + def allocate(CSchema schema) -> CArray: + """Allocate a released ArrowArray""" cdef ArrowArray* c_array_out base = alloc_c_array(&c_array_out) return CArray(base, c_array_out, schema) @@ -119,7 +297,7 @@ cdef class CArray: self._device_id = device_id @staticmethod - def _import_from_c_capsule(schema_capsule, array_capsule): + def _import_from_c_capsule(schema_capsule, array_capsule) -> CArray: """ Import from a ArrowSchema and ArrowArray PyCapsule tuple. 
@@ -145,7 +323,7 @@ cdef class CArray: return out - def __getitem__(self, k): + def __getitem__(self, k) -> CArray: self._assert_valid() if not isinstance(k, slice): @@ -215,10 +393,11 @@ cdef class CArray: return self._schema.__arrow_c_schema__(), array_capsule - def _addr(self): + def _addr(self) -> int: return self._ptr - def is_valid(self): + def is_valid(self) -> bool: + """Check for a non-null and non-released underlying ArrowArray""" return self._ptr != NULL and self._ptr.release != NULL def _assert_valid(self): @@ -227,56 +406,57 @@ cdef class CArray: if self._ptr.release == NULL: raise RuntimeError("CArray is released") - def view(self): + def view(self) -> CArrayView: + """Allocate a :class:`CArrayView` to access the buffers of this array""" device = Device.resolve(self._device_type, self._device_id) return CArrayView.from_array(self, device) @property - def schema(self): + def schema(self) -> CSchema: return self._schema @property - def device_type(self): + def device_type(self) -> DeviceType: return DeviceType(self._device_type) @property - def device_type_id(self): + def device_type_id(self) -> int: return self._device_type @property - def device_id(self): + def device_id(self) -> int: return self._device_id - def __len__(self): + def __len__(self) -> int: self._assert_valid() return self._ptr.length @property - def length(self): + def length(self) -> int: return len(self) @property - def offset(self): + def offset(self) -> int: self._assert_valid() return self._ptr.offset @property - def null_count(self): + def null_count(self) -> int: self._assert_valid() return self._ptr.null_count @property - def n_buffers(self): + def n_buffers(self) -> int: self._assert_valid() return self._ptr.n_buffers @property - def buffers(self): + def buffers(self) -> Tuple[int, ...]: self._assert_valid() return tuple(self._ptr.buffers[i] for i in range(self._ptr.n_buffers)) @property - def n_children(self): + def n_children(self) -> int: self._assert_valid() return 
self._ptr.n_children @@ -293,12 +473,12 @@ cdef class CArray: return out @property - def children(self): + def children(self) -> Iterable[CArray]: for i in range(self.n_children): yield self.child(i) @property - def dictionary(self): + def dictionary(self) -> Union[CArray, None]: self._assert_valid() cdef CArray out if self._ptr.dictionary != NULL: @@ -308,186 +488,16 @@ cdef class CArray: else: return None - def __repr__(self): + def __repr__(self) -> str: return _repr_utils.array_repr(self) -cdef class CArrayView: - """Low-level ArrowArrayView wrapper - - This object is a literal wrapper around an ArrowArrayView. It provides field accessors - that return Python objects and handles the structure lifecycle (i.e., initialized - ArrowArrayView structures are always released). +cdef class CArrayBuilder: + """Helper for constructing an ArrowArray - See `nanoarrow.c_array_view()` for construction and usage examples. + The primary function of this class is to wrap the nanoarrow C library calls + that build up the components of an ArrowArray. 
""" - - def __cinit__(self, object base, uintptr_t addr): - self._base = base - self._ptr = addr - self._device = DEVICE_CPU - - def _set_array(self, CArray array, Device device=DEVICE_CPU): - cdef Error error = Error() - cdef int code - - if device is DEVICE_CPU: - code = ArrowArrayViewSetArray(self._ptr, array._ptr, &error.c_error) - else: - code = ArrowArrayViewSetArrayMinimal(self._ptr, array._ptr, &error.c_error) - - error.raise_message_not_ok("ArrowArrayViewSetArray()", code) - self._array_base = array._base - self._device = device - return self - - @property - def storage_type_id(self): - return self._ptr.storage_type - - @property - def storage_type(self): - cdef const char* type_str = ArrowTypeString(self._ptr.storage_type) - if type_str != NULL: - return type_str.decode('UTF-8') - - @property - def layout(self): - return CLayout(self, &self._ptr.layout) - - def __len__(self): - return self._ptr.length - - @property - def length(self): - return len(self) - - @property - def offset(self): - return self._ptr.offset - - @property - def null_count(self): - if self._ptr.null_count != -1: - return self._ptr.null_count - - cdef ArrowBufferType buffer_type = self._ptr.layout.buffer_type[0] - cdef const uint8_t* validity_bits = self._ptr.buffer_views[0].data.as_uint8 - - if buffer_type != NANOARROW_BUFFER_TYPE_VALIDITY: - self._ptr.null_count = 0 - elif validity_bits == NULL: - self._ptr.null_count = 0 - elif self._device is DEVICE_CPU: - self._ptr.null_count = ( - self._ptr.length - - ArrowBitCountSet(validity_bits, self.offset, self.length) - ) - - return self._ptr.null_count - - @property - def n_children(self): - return self._ptr.n_children - - def child(self, int64_t i): - if i < 0 or i >= self._ptr.n_children: - raise IndexError(f"{i} out of range [0, {self._ptr.n_children})") - - cdef CArrayView child = CArrayView( - self._base, - self._ptr.children[i] - ) - - child._device = self._device - return child - - @property - def children(self): - for i in 
range(self.n_children): - yield self.child(i) - - @property - def n_buffers(self): - return self.layout.n_buffers - - def buffer_type(self, int64_t i): - if i < 0 or i >= self.n_buffers: - raise IndexError(f"{i} out of range [0, {self.n_buffers}]") - - buffer_type = self._ptr.layout.buffer_type[i] - if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY: - return "validity" - elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID: - return "type_id" - elif buffer_type == NANOARROW_BUFFER_TYPE_UNION_OFFSET: - return "union_offset" - elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_OFFSET: - return "data_offset" - elif buffer_type == NANOARROW_BUFFER_TYPE_DATA: - return "data" - else: - return "none" - - def buffer(self, int64_t i): - if i < 0 or i >= self.n_buffers: - raise IndexError(f"{i} out of range [0, {self.n_buffers}]") - - cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i]) - - # Check the buffer size here because the error later is cryptic. - # Buffer sizes are set to -1 when they are "unknown", so because of errors - # in nanoarrow/C or because the array is on a non-CPU device, that -1 value - # could leak its way here. 
- if buffer_view.size_bytes < 0: - raise RuntimeError(f"ArrowArrayView buffer {i} has size_bytes < 0") - - return CBufferView( - self._array_base, - buffer_view.data.data, - buffer_view.size_bytes, - self._ptr.layout.buffer_data_type[i], - self._ptr.layout.element_size_bits[i], - self._device - ) - - @property - def buffers(self): - for i in range(self.n_buffers): - yield self.buffer(i) - - @property - def dictionary(self): - if self._ptr.dictionary == NULL: - return None - else: - return CArrayView( - self, - self._ptr.dictionary - ) - - def __repr__(self): - return _repr_utils.array_view_repr(self) - - @staticmethod - def from_schema(CSchema schema): - cdef ArrowArrayView* c_array_view - base = alloc_c_array_view(&c_array_view) - - cdef Error error = Error() - cdef int code = ArrowArrayViewInitFromSchema(c_array_view, - schema._ptr, &error.c_error) - error.raise_message_not_ok("ArrowArrayViewInitFromSchema()", code) - - return CArrayView(base, c_array_view) - - @staticmethod - def from_array(CArray array, Device device=DEVICE_CPU): - out = CArrayView.from_schema(array._schema) - return out._set_array(array, device) - - -cdef class CArrayBuilder: cdef CArray c_array cdef ArrowArray* _ptr cdef bint _can_validate @@ -499,15 +509,21 @@ cdef class CArrayBuilder: @staticmethod def allocate(): + """Create a CArrayBuilder + + Allocates memory for an ArrowArray but leaves it uninitialized (its release + callback is still NULL, as the init methods require). This should + usually be followed by :meth:`init_from_type` or :meth:`init_from_schema`. 
+ """ return CArrayBuilder(CArray.allocate(CSchema.allocate())) - def is_empty(self): + def is_empty(self) -> bool: if self._ptr.release == NULL: raise RuntimeError("CArrayBuilder is not initialized") return self._ptr.length == 0 - def init_from_type(self, int type_id): + def init_from_type(self, int type_id) -> CArrayBuilder: if self._ptr.release != NULL: raise RuntimeError("CArrayBuilder is already initialized") @@ -519,7 +535,7 @@ cdef class CArrayBuilder: return self - def init_from_schema(self, CSchema schema): + def init_from_schema(self, CSchema schema) -> CArrayBuilder: if self._ptr.release != NULL: raise RuntimeError("CArrayBuilder is already initialized") @@ -530,12 +546,12 @@ cdef class CArrayBuilder: self.c_array._schema = schema return self - def start_appending(self): + def start_appending(self) -> CArrayBuilder: cdef int code = ArrowArrayStartAppending(self._ptr) Error.raise_error_not_ok("ArrowArrayStartAppending()", code) return self - def append_strings(self, obj): + def append_strings(self, obj: Iterable[Union[str, None]]) -> CArrayBuilder: cdef int code cdef Py_ssize_t item_utf8_size cdef ArrowStringView item @@ -557,7 +573,7 @@ cdef class CArrayBuilder: return self - def append_bytes(self, obj): + def append_bytes(self, obj: Iterable[Union[str, None]]) -> CArrayBuilder: cdef Py_buffer buffer cdef ArrowBufferView item @@ -582,22 +598,23 @@ cdef class CArrayBuilder: if code != NANOARROW_OK: Error.raise_error(f"append bytes item {py_item}", code) - def set_offset(self, int64_t offset): + def set_offset(self, int64_t offset) -> CArrayBuilder: self.c_array._assert_valid() self._ptr.offset = offset return self - def set_length(self, int64_t length): + def set_length(self, int64_t length) -> CArrayBuilder: self.c_array._assert_valid() self._ptr.length = length return self - def set_null_count(self, int64_t null_count): + def set_null_count(self, int64_t null_count) -> CArrayBuilder: self.c_array._assert_valid() self._ptr.null_count = null_count return 
self - def resolve_null_count(self): + def resolve_null_count(self) -> CArrayBuilder: + """Ensure the output null count is synchronized with existing buffers""" self.c_array._assert_valid() # This doesn't apply to unions. We currently don't have a schema view @@ -635,8 +652,10 @@ cdef class CArrayBuilder: self._ptr.null_count = self._ptr.length - count return self - def set_buffer(self, int64_t i, CBuffer buffer, move=False): - """Sets a buffer of this ArrowArray such the pointer at array->buffers[i] is + def set_buffer(self, int64_t i, CBuffer buffer, move=False) -> CArrayBuilder: + """Set an ArrowArray buffer + + Sets a buffer of this ArrowArray such that the pointer at array->buffers[i] is equal to buffer->data and such that the buffer's lifcycle is managed by the array. If move is True, the input Python object that previously wrapped the ArrowBuffer will be invalidated, which is usually the desired behaviour @@ -659,7 +678,7 @@ cdef class CArrayBuilder: return self - def set_child(self, int64_t i, CArray c_array, move=False): + def set_child(self, int64_t i, CArray c_array, move=False) -> CArrayBuilder: cdef CArray child = self.c_array.child(i) if child._ptr.release != NULL: ArrowArrayRelease(child._ptr) @@ -676,7 +695,7 @@ cdef class CArrayBuilder: return self - def finish(self, validation_level=None): + def finish(self, validation_level=None) -> CArray: self.c_array._assert_valid() cdef ArrowValidationLevel c_validation_level cdef Error error = Error()