diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx
index 3baaf16e0..0bd3a961a 100644
--- a/python/src/nanoarrow/_array.pyx
+++ b/python/src/nanoarrow/_array.pyx
@@ -22,10 +22,12 @@ from cpython.pycapsule cimport PyCapsule_GetPointer
 from cpython.unicode cimport PyUnicode_AsUTF8AndSize
 from cpython cimport (
     Py_buffer,
-    PyObject_GetBuffer,
     PyBuffer_Release,
     PyBUF_ANY_CONTIGUOUS,
     PyBUF_FORMAT,
+    PyBytes_FromStringAndSize,
+    PyObject_GetBuffer,
+    PyUnicode_FromStringAndSize,
 )
 
 from nanoarrow_c cimport (
@@ -43,6 +45,9 @@ from nanoarrow_c cimport (
     ArrowArrayView,
     ArrowArrayViewComputeNullCount,
     ArrowArrayViewInitFromSchema,
+    ArrowArrayViewIsNull,
+    ArrowArrayViewGetStringUnsafe,
+    ArrowArrayViewGetBytesUnsafe,
     ArrowArrayViewSetArray,
     ArrowArrayViewSetArrayMinimal,
     ArrowBitCountSet,
@@ -57,6 +62,7 @@ from nanoarrow_c cimport (
     ArrowValidationLevel,
     NANOARROW_BUFFER_TYPE_DATA,
     NANOARROW_BUFFER_TYPE_DATA_OFFSET,
+    NANOARROW_BUFFER_TYPE_DATA_VIEW,
     NANOARROW_BUFFER_TYPE_TYPE_ID,
     NANOARROW_BUFFER_TYPE_UNION_OFFSET,
     NANOARROW_BUFFER_TYPE_VALIDITY,
@@ -78,6 +84,7 @@ from nanoarrow._device cimport Device, CSharedSyncEvent
 
 from nanoarrow._buffer cimport CBuffer, CBufferView
 from nanoarrow._schema cimport CSchema, CLayout
+from nanoarrow cimport _types
 from nanoarrow._utils cimport (
     alloc_c_array,
     alloc_c_device_array,
@@ -189,13 +196,56 @@ cdef class CArrayView:
 
     @property
     def n_buffers(self):
+        if _types.is_data_view(self._ptr.storage_type):
+            return 2 + self._ptr.n_variadic_buffers + 1
+
         return self.layout.n_buffers
 
-    def buffer_type(self, int64_t i):
+    def _buffer_info(self, int64_t i):
+        """Return (buffer_type, data_type, element_size_bits, address, size_bytes)
+        for buffer ``i``.
+
+        String/binary view arrays expose validity, views, one buffer per variadic
+        data buffer, and a trailing int64 buffer of variadic buffer sizes. Buffers
+        after the views are not described by the fixed layout, so their address
+        and size are read from the ArrowArray directly.
+        """
         if i < 0 or i >= self.n_buffers:
             raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
 
-        buffer_type = self._ptr.layout.buffer_type[i]
+        if (
+            _types.is_data_view(self._ptr.storage_type)
+            and i == (2 + self._ptr.n_variadic_buffers)
+        ):
+            return (
+                NANOARROW_BUFFER_TYPE_DATA,
+                _types.INT64,
+                64,
+                <uintptr_t>self._ptr.array.buffers[i],
+                (self._ptr.n_variadic_buffers) * 8
+            )
+        elif (
+            _types.is_data_view(self._ptr.storage_type)
+            and i >= 2
+        ):
+            return (
+                NANOARROW_BUFFER_TYPE_DATA,
+                _types.STRING if int(self._ptr.storage_type) == _types.STRING_VIEW else _types.BINARY,
+                0,
+                <uintptr_t>self._ptr.array.buffers[i],
+                (<int64_t*>self._ptr.array.buffers[2 + self._ptr.n_variadic_buffers])[i - 2]
+            )
+
+        return (
+            self._ptr.layout.buffer_type[i],
+            self._ptr.layout.buffer_data_type[i],
+            self._ptr.layout.element_size_bits[i],
+            <uintptr_t>self._ptr.buffer_views[i].data.data,
+            self._ptr.buffer_views[i].size_bytes
+        )
+
+    def buffer_type(self, int64_t i):
+        buffer_type = self._buffer_info(i)[0]
         if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
             return "validity"
         elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
@@ -206,14 +256,17 @@ cdef class CArrayView:
             return "data_offset"
         elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
             return "data"
+        elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_VIEW:
+            return "data_view"
         else:
             return "none"
 
     def buffer(self, int64_t i):
-        if i < 0 or i >= self.n_buffers:
-            raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
+        _, data_type, element_size_bits, addr, size = self._buffer_info(i)
 
-        cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])
+        cdef ArrowBufferView buffer_view
+        buffer_view.data.data = <const void*><uintptr_t>addr
+        buffer_view.size_bytes = size
 
         # Check the buffer size here because the error later is cryptic.
         # Buffer sizes are set to -1 when they are "unknown", so because of errors
@@ -224,10 +277,10 @@ cdef class CArrayView:
 
         return CBufferView(
             self._array_base,
-            <uintptr_t>buffer_view.data.data,
-            buffer_view.size_bytes,
-            self._ptr.layout.buffer_data_type[i],
-            self._ptr.layout.element_size_bits[i],
+            addr,
+            size,
+            data_type,
+            element_size_bits,
             self._event
         )
 
@@ -249,6 +302,26 @@ cdef class CArrayView:
 
         return dictionary
 
+    def _iter_bytes(self, int64_t offset, int64_t length) -> bytes | None:
+        """Yield ``length`` elements starting at logical index ``offset`` as bytes (None for nulls)"""
+        cdef ArrowBufferView item_view
+        for i in range(offset, offset + length):
+            if ArrowArrayViewIsNull(self._ptr, i):
+                yield None
+            else:
+                item_view = ArrowArrayViewGetBytesUnsafe(self._ptr, i)
+                yield PyBytes_FromStringAndSize(item_view.data.as_char, item_view.size_bytes)
+
+    def _iter_str(self, int64_t offset, int64_t length) -> str | None:
+        """Yield ``length`` elements starting at logical index ``offset`` as str (None for nulls)"""
+        cdef ArrowStringView item_view
+        for i in range(offset, offset + length):
+            if ArrowArrayViewIsNull(self._ptr, i):
+                yield None
+            else:
+                item_view = ArrowArrayViewGetStringUnsafe(self._ptr, i)
+                yield PyUnicode_FromStringAndSize(item_view.data, item_view.size_bytes)
+
     def __repr__(self):
         return _repr_utils.array_view_repr(self)
 
diff --git a/python/src/nanoarrow/_types.pxd b/python/src/nanoarrow/_types.pxd
index 4a53fe310..3dc727c47 100644
--- a/python/src/nanoarrow/_types.pxd
+++ b/python/src/nanoarrow/_types.pxd
@@ -90,6 +90,8 @@ cpdef bint has_time_unit(int type_id)
 
 cpdef bint is_union(int type_id)
 
+cpdef bint is_data_view(int type_id)
+
 cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* out)
 
 cdef tuple from_format(format)
diff --git a/python/src/nanoarrow/_types.pyi b/python/src/nanoarrow/_types.pyi
index 23c8cf8c8..e55097b2b 100644
--- a/python/src/nanoarrow/_types.pyi
+++ b/python/src/nanoarrow/_types.pyi
@@ -20,6 +20,7 @@ import enum
 from typing import Callable, ClassVar
 
 BINARY: CArrowType
+BINARY_VIEW: CArrowType
 BOOL: CArrowType
 DATE32: CArrowType
 DATE64: CArrowType
@@ -47,8 +48,10 @@ LARGE_STRING: CArrowType
 LIST: CArrowType
 MAP: CArrowType
 NA: CArrowType
+RUN_END_ENCODED: CArrowType
 SPARSE_UNION: CArrowType
 STRING: CArrowType
+STRING_VIEW: CArrowType
 STRUCT: CArrowType
 TIME32: CArrowType
 TIME64: CArrowType
@@ -61,6 +64,7 @@ UNINITIALIZED: CArrowType
 __pyx_capi__: dict
 __test__: dict
 has_time_unit: _cython_3_0_11.cython_function_or_method
+is_data_view: _cython_3_0_11.cython_function_or_method
 is_decimal: _cython_3_0_11.cython_function_or_method
 is_fixed_size: _cython_3_0_11.cython_function_or_method
 is_floating_point: _cython_3_0_11.cython_function_or_method
@@ -72,6 +76,7 @@ sys_byteorder: str
 class CArrowType(enum.IntFlag):
     __new__: ClassVar[Callable] = ...
     BINARY: ClassVar[CArrowType] = ...
+    BINARY_VIEW: ClassVar[CArrowType] = ...
     BOOL: ClassVar[CArrowType] = ...
     DATE32: ClassVar[CArrowType] = ...
     DATE64: ClassVar[CArrowType] = ...
@@ -99,8 +104,10 @@ class CArrowType(enum.IntFlag):
     LIST: ClassVar[CArrowType] = ...
     MAP: ClassVar[CArrowType] = ...
     NA: ClassVar[CArrowType] = ...
+    RUN_END_ENCODED: ClassVar[CArrowType] = ...
     SPARSE_UNION: ClassVar[CArrowType] = ...
     STRING: ClassVar[CArrowType] = ...
+    STRING_VIEW: ClassVar[CArrowType] = ...
     STRUCT: ClassVar[CArrowType] = ...
     TIME32: ClassVar[CArrowType] = ...
     TIME64: ClassVar[CArrowType] = ...
diff --git a/python/src/nanoarrow/_types.pyx b/python/src/nanoarrow/_types.pyx
index c10463b09..e43545f2d 100644
--- a/python/src/nanoarrow/_types.pyx
+++ b/python/src/nanoarrow/_types.pyx
@@ -109,6 +109,14 @@ cpdef bint is_union(int type_id):
     )
 
 
+cpdef bint is_data_view(int type_id):
+    """Check if type_id is a binary view or string view type"""
+    return type_id in (
+        _types.BINARY_VIEW,
+        _types.STRING_VIEW
+    )
+
+
 cdef tuple from_format(format):
     """Convert a Python buffer protocol format string to a itemsize/type_id tuple
 
@@ -236,6 +244,10 @@ cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* ou
     elif type_id == _types.DECIMAL256:
         format_const = "32s"
         element_size_bits_calc = 256
+    elif is_data_view(type_id):
+        # Each view element is exposed as one opaque 16-byte (128-bit) item
+        format_const = "16s"
+        element_size_bits_calc = 128
     else:
         raise ValueError(f"Unsupported Arrow type_id for format conversion: {type_id}")
 
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
index d0250456d..0c71bda45 100644
--- a/python/src/nanoarrow/c_array.py
+++ b/python/src/nanoarrow/c_array.py
@@ -547,8 +547,10 @@ def _append_using_buffer_builder(self, obj: Iterable) -> None:
     _types.BINARY: "_append_bytes",
     _types.LARGE_BINARY: "_append_bytes",
     _types.FIXED_SIZE_BINARY: "_append_bytes",
+    _types.BINARY_VIEW: "_append_bytes",
     _types.STRING: "_append_strings",
     _types.LARGE_STRING: "_append_strings",
+    _types.STRING_VIEW: "_append_strings",
     _types.INT8: "_append_using_array",
     _types.UINT8: "_append_using_array",
     _types.INT16: "_append_using_array",
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index 874f285fc..fc6e1428f 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -322,6 +322,14 @@ def _binary_iter(self, offset, length):
         for start, end in zip(starts, ends):
             yield bytes(data[start:end])
 
+    def _binary_view_iter(self, offset, length):
+        # Per-element access for view types is delegated to the array view
+        return self._array_view._iter_bytes(offset, length)
+
+    def _string_view_iter(self, offset, length):
+        # Per-element access for view types is delegated to the array view
+        return self._array_view._iter_str(offset, length)
+
     def _decimal_iter(self, offset, length):
         from decimal import Context, Decimal
         from sys import byteorder
@@ -564,6 +572,8 @@ def _get_tzinfo(tz_string, strategy=None):
     _types.DURATION: "_duration_iter",
     _types.DECIMAL128: "_decimal_iter",
     _types.DECIMAL256: "_decimal_iter",
+    _types.STRING_VIEW: "_string_view_iter",
+    _types.BINARY_VIEW: "_binary_view_iter",
 }
 
 _PRIMITIVE_TYPE_NAMES = [
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index e4d98f176..d2993ec6e 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -288,6 +288,32 @@ def test_c_array_from_iterable_string():
         na.c_array([b"1234"], na.string())
 
 
+def test_c_array_from_iterable_string_view():
+    string = na.c_array(
+        ["abc", None, "a string longer than 12 bytes"], na.string_view()
+    )
+    assert string.length == 3
+    assert string.null_count == 1
+    assert string.n_buffers == 4
+
+    array_view = string.view()
+    assert len(array_view.buffer(0)) == 1
+    assert bytes(array_view.buffer(2)) == b"a string longer than 12 bytes"
+    assert list(array_view.buffer(3)) == [len("a string longer than 12 bytes")]
+
+    # Make sure this also works when all strings are inlined (i.e., no variadic buffers)
+    string = na.c_array(["abc", None, "short string"], na.string_view())
+    assert string.length == 3
+    assert string.null_count == 1
+    assert string.n_buffers == 3
+
+    array_view = string.view()
+    assert len(array_view.buffer(0)) == 1
+    assert len(array_view.buffer(1)) == 3
+    assert len(bytes(array_view.buffer(1))) == 3 * 16
+    assert list(array_view.buffer(2)) == []
+
+
 def test_c_array_from_iterable_bytes():
     string = na.c_array([b"abc", None, b"defg"], na.binary())
     assert string.length == 3
@@ -311,6 +337,20 @@ def test_c_array_from_iterable_bytes():
         na.c_array([buf_2d], na.binary())
 
 
+def test_c_array_from_iterable_bytes_view():
+    binary = na.c_array(
+        [b"abc", None, b"a string longer than 12 bytes"], na.binary_view()
+    )
+    assert binary.length == 3
+    assert binary.null_count == 1
+    assert binary.n_buffers == 4
+
+    array_view = binary.view()
+    assert len(array_view.buffer(0)) == 1
+    assert bytes(array_view.buffer(2)) == b"a string longer than 12 bytes"
+    assert list(array_view.buffer(3)) == [len("a string longer than 12 bytes")]
+
+
 def test_c_array_from_iterable_non_empty_nullable_without_nulls():
     c_array = na.c_array([1, 2, 3], na.int32())
     assert c_array.length == 3
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index fe6e8bbd8..0c0e0474a 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -68,8 +68,11 @@ def test_iterator_nullable_primitive():
     assert list(iter_py(sliced)) == [2, 3, None]
 
 
-def test_iterator_string():
-    array = na.c_array(["ab", "cde"], na.string())
+@pytest.mark.parametrize(
+    "arrow_type", [na.string(), na.large_string(), na.string_view()]
+)
+def test_iterator_string(arrow_type):
+    array = na.c_array(["ab", "cde"], arrow_type)
 
     assert list(iter_py(array)) == ["ab", "cde"]
 
@@ -77,8 +80,11 @@ def test_iterator_string():
     assert list(iter_py(sliced)) == ["cde"]
 
 
-def test_iterator_nullable_string():
-    array = na.c_array(["ab", "cde", None], na.string())
+@pytest.mark.parametrize(
+    "arrow_type", [na.string(), na.large_string(), na.string_view()]
+)
+def test_iterator_nullable_string(arrow_type):
+    array = na.c_array(["ab", "cde", None], arrow_type)
 
     assert list(iter_py(array)) == ["ab", "cde", None]
 
@@ -86,8 +92,11 @@ def test_iterator_nullable_string():
     assert list(iter_py(sliced)) == ["cde", None]
 
 
-def test_iterator_binary():
-    array = na.c_array([b"ab", b"cde"], na.binary())
+@pytest.mark.parametrize(
+    "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
+)
+def test_iterator_binary(arrow_type):
+    array = na.c_array([b"ab", b"cde"], arrow_type)
 
     assert list(iter_py(array)) == [b"ab", b"cde"]
 
@@ -95,8 +104,11 @@ def test_iterator_binary():
     assert list(iter_py(sliced)) == [b"cde"]
 
 
-def test_iterator_nullable_binary():
-    array = na.c_array([b"ab", b"cde", None], na.binary())
+@pytest.mark.parametrize(
+    "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
+)
+def test_iterator_nullable_binary(arrow_type):
+    array = na.c_array([b"ab", b"cde", None], arrow_type)
 
     assert list(iter_py(array)) == [b"ab", b"cde", None]