apache · paleolimbot · May 17, 2024 · May 13, 2024 · May 13, 2024 · May 13, 2024
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
@@ -72,6 +72,7 @@
 )
 from nanoarrow.array import array, Array
 from nanoarrow.array_stream import ArrayStream
+from nanoarrow.visitor import nulls_as_sentinel, nulls_forbid, nulls_separate
 from nanoarrow._version import __version__  # noqa: F401
 
 # Helps Sphinx automatically populate an API reference section
@@ -113,6 +114,9 @@
     "large_list",
     "list_",
     "null",
+    "nulls_as_sentinel",
+    "nulls_forbid",
+    "nulls_separate",
     "string",
     "struct",
     "schema",

diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
@@ -1009,6 +1009,11 @@ cdef class CSchemaView:
         if self.extension_name or self._schema_view.type != self._schema_view.storage_type:
             return None
 
+        # Types whose layout does not have exactly two buffers (e.g., string/binary)
+        # do not have format strings as far as the Python buffer protocol is concerned
+        if self.layout.n_buffers != 2:
+            return None
+
         cdef char out[128]
         cdef int element_size_bits = 0
         if self._schema_view.type == NANOARROW_TYPE_FIXED_SIZE_BINARY:
@@ -1632,6 +1637,22 @@ cdef class CArrayView:
 
     @property
     def null_count(self):
+        if self._ptr.null_count != -1:  # already known: reuse the stored count
+            return self._ptr.null_count
+        # Not stored yet: derive it from the first buffer and store it on the view
+        cdef ArrowBufferType buffer_type = self._ptr.layout.buffer_type[0]
+        cdef uint8_t* validity_bits = self._ptr.buffer_views[0].data.as_uint8
+        # First buffer is not a validity bitmap, or the bitmap is absent: zero nulls
+        if buffer_type != NANOARROW_BUFFER_TYPE_VALIDITY:
+            self._ptr.null_count = 0
+        elif validity_bits == NULL:
+            self._ptr.null_count = 0
+        elif self._device is DEVICE_CPU:  # bits are only counted for CPU arrays
+            self._ptr.null_count = (  # length minus the number of set validity bits
+                self._ptr.length -
+                ArrowBitCountSet(validity_bits, self.offset, self.length)
+            )
+        # On any other device the stored value is left as it was (i.e., -1)
         return self._ptr.null_count
 
     @property
@@ -1869,7 +1890,7 @@ cdef class CBufferView:
         return self._format.decode("UTF-8")
 
     @property
-    def item_size(self):
+    def itemsize(self):  # the item size is whatever is stored as the view's stride
         return self._strides
 
     def __len__(self):
@@ -1957,7 +1978,7 @@ cdef class CBufferView:
 
         cdef int64_t c_offset = offset
         cdef int64_t c_length = length
-        cdef int64_t c_item_size = self.item_size
+        cdef int64_t c_item_size = self.itemsize
         cdef int64_t c_dest_offset = dest_offset
         self._check_copy_into_bounds(&buffer, c_offset, c_length, dest_offset, c_item_size)
 
@@ -2010,7 +2031,7 @@ cdef class CBufferView:
         if length is None:
             length = self.n_elements
 
-        cdef int64_t bytes_to_copy = length * self.item_size
+        cdef int64_t bytes_to_copy = length * self.itemsize
         out = CBufferBuilder().set_data_type(self.data_type_id)
         out.reserve_bytes(bytes_to_copy)
         self.copy_into(out, offset, length)
@@ -2224,9 +2245,9 @@ cdef class CBuffer:
         return self._element_size_bits
 
     @property
-    def item_size(self):
+    def itemsize(self):
         self._assert_valid()
-        return self._view.item_size
+        return self._view.itemsize  # forwarded from this buffer's view
 
     @property
     def format(self):
@@ -2339,6 +2360,13 @@ cdef class CBufferBuilder:
         """The number of bytes that have been written to this buffer"""
         return self._buffer.size_bytes
 
+    @property
+    def itemsize(self):
+        return self._buffer.itemsize  # forwarded from the underlying buffer object
+
+    def __len__(self):  # whole items written so far: bytes written // itemsize
+        return self._buffer.size_bytes // self.itemsize
+
     @property
     def capacity_bytes(self):
         """The number of bytes allocated in the underlying buffer"""

diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py
@@ -17,7 +17,7 @@
 
 import itertools
 from functools import cached_property
-from typing import Iterable, List, Sequence, Tuple
+from typing import Iterable, Tuple
 
 from nanoarrow._lib import (
     DEVICE_CPU,
@@ -32,7 +32,7 @@
 from nanoarrow.c_schema import c_schema
 from nanoarrow.iterator import iter_array_views, iter_py, iter_tuples
 from nanoarrow.schema import Schema, _schema_repr
-from nanoarrow.visitor import to_columns, to_pylist
+from nanoarrow.visitor import ArrayViewVisitable
 
 from nanoarrow import _repr_utils
 
@@ -106,7 +106,7 @@ def __arrow_c_array__(self, requested_schema=None):
         return array.__arrow_c_array__(requested_schema=requested_schema)
 
 
-class Array:
+class Array(ArrayViewVisitable):
     """High-level in-memory Array representation
 
     The Array is nanoarrow's high-level in-memory array representation whose
@@ -345,42 +345,6 @@ def iter_chunk_views(self) -> Iterable[CArrayView]:
         """
         return iter_array_views(self)
 
-    def to_pylist(self) -> List:
-        """Convert this Array to a ``list()` of Python objects
-
-        Computes an identical value to list(:meth:`iter_py`) but can be several
-        times faster.
-
-        Examples
-        --------
-
-        >>> import nanoarrow as na
-        >>> array = na.Array([1, 2, 3], na.int32())
-        >>> array.to_pylist()
-        [1, 2, 3]
-        """
-        return to_pylist(self)
-
-    def to_columns(self) -> Tuple[str, Sequence]:
-        """Convert this Array to a ``list()` of sequences
-
-        Converts a stream of struct arrays into its column-wise representation
-        such that each column is either a contiguous buffer or a ``list()``.
-
-        Examples
-        --------
-
-        >>> import nanoarrow as na
-        >>> import pyarrow as pa
-        >>> array = na.Array(pa.record_batch([pa.array([1, 2, 3])], names=["col1"]))
-        >>> names, columns = array.to_columns()
-        >>> names
-        ['col1']
-        >>> columns
-        [[1, 2, 3]]
-        """
-        return to_columns(self)
-
     @property
     def n_children(self) -> int:
         """Get the number of children for an Array of this type.

diff --git a/python/src/nanoarrow/array_stream.py b/python/src/nanoarrow/array_stream.py
@@ -16,18 +16,18 @@
 # under the License.
 
 from functools import cached_property
-from typing import Iterable, List, Sequence, Tuple
+from typing import Iterable, Tuple
 
 from nanoarrow._lib import CMaterializedArrayStream
 from nanoarrow._repr_utils import make_class_label
 from nanoarrow.array import Array
 from nanoarrow.c_array_stream import c_array_stream
 from nanoarrow.iterator import iter_py, iter_tuples
 from nanoarrow.schema import Schema, _schema_repr
-from nanoarrow.visitor import to_columns, to_pylist
+from nanoarrow.visitor import ArrayViewVisitable
 
 
-class ArrayStream:
+class ArrayStream(ArrayViewVisitable):
     """High-level ArrayStream representation
 
     The ArrayStream is nanoarrow's high-level representation of zero
@@ -199,43 +199,6 @@ def iter_tuples(self) -> Iterable[Tuple]:
         """
         return iter_tuples(self)
 
-    def to_pylist(self) -> List:
-        """Convert this Array to a ``list()` of Python objects
-
-        Computes an identical value to list(:meth:`iter_py`) but can be several
-        times faster.
-
-        Examples
-        --------
-
-        >>> import nanoarrow as na
-        >>> stream = na.ArrayStream([1, 2, 3], na.int32())
-        >>> stream.to_pylist()
-        [1, 2, 3]
-        """
-        return to_pylist(self)
-
-    def to_columns(self) -> Tuple[str, Sequence]:
-        """Convert this Array to a ``list()` of sequences
-
-        Converts a stream of struct arrays into its column-wise representation
-        such that each column is either a contiguous buffer or a ``list()``.
-
-        Examples
-        --------
-
-        >>> import nanoarrow as na
-        >>> import pyarrow as pa
-        >>> batch = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
-        >>> stream = na.ArrayStream(batch)
-        >>> names, columns = stream.to_columns()
-        >>> names
-        ['col1']
-        >>> columns
-        [[1, 2, 3]]
-        """
-        return to_columns(self)
-
     def __repr__(self) -> str:
         cls = make_class_label(self, "nanoarrow")
         schema_repr = _schema_repr(self.schema, prefix="", include_metadata=False)

diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
@@ -156,11 +156,7 @@ def _object_label(self):
             return f"<unnamed {self._schema_view.type}>"
 
     def _contains_nulls(self):
-        return (
-            self._schema_view.nullable
-            and len(self._array_view.buffer(0))
-            and self._array_view.null_count != 0
-        )
+        return self._schema_view.nullable and self._array_view.null_count != 0
 
     def _set_array(self, array):
         self._array_view._set_array(array)