more docs

apache · Jul 18, 2024 · 063f1ec · 063f1ec
1 parent 78dc855
commit 063f1ec
Show file tree

Hide file tree

Showing 2 changed files with 109 additions and 44 deletions.
diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx
@@ -298,8 +298,7 @@ cdef class CArray:
 
     @staticmethod
     def _import_from_c_capsule(schema_capsule, array_capsule) -> CArray:
-        """
-        Import from a ArrowSchema and ArrowArray PyCapsule tuple.
+        """Import from a ArrowSchema and ArrowArray PyCapsule tuple.
 
         Parameters
         ----------
@@ -518,6 +517,7 @@ cdef class CArrayBuilder:
         return CArrayBuilder(CArray.allocate(CSchema.allocate()))
 
     def is_empty(self) -> bool:
+        """Check if any items have been appended to this builder"""
         if self._ptr.release == NULL:
             raise RuntimeError("CArrayBuilder is not initialized")
 
@@ -547,6 +547,11 @@ cdef class CArrayBuilder:
         return self
 
     def start_appending(self) -> CArrayBuilder:
+        """Use append mode for building this ArrowArray
+
+        Calling this method is required to produce a valid array prior to calling
+        :meth:`append_strings` or `append_bytes`.
+        """
         cdef int code = ArrowArrayStartAppending(self._ptr)
         Error.raise_error_not_ok("ArrowArrayStartAppending()", code)
         return self
@@ -661,7 +666,8 @@ cdef class CArrayBuilder:
         the ArrowBuffer will be invalidated, which is usually the desired behaviour
         if you built or imported a buffer specifically to build this array. If move
         is False (the default), this function will a make a shallow copy via another
-        layer of Python object wrapping."""
+        layer of Python object wrapping.
+        """
         if i < 0 or i > 3:
             raise IndexError("i must be >= 0 and <= 3")
 
@@ -679,6 +685,13 @@ cdef class CArrayBuilder:
         return self
 
     def set_child(self, int64_t i, CArray c_array, move=False) -> CArrayBuilder:
+        """Set an ArrowArray child
+
+        Set a child of this array by performing a show copy or optionally
+        transferring ownership to this object. The initialized child array
+        must have been initialized before this call by initializing this
+        builder with a schema containing the correct number of children.
+        """
         cdef CArray child = self.c_array.child(i)
         if child._ptr.release != NULL:
             ArrowArrayRelease(child._ptr)
@@ -696,6 +709,19 @@ cdef class CArrayBuilder:
         return self
 
     def finish(self, validation_level=None) -> CArray:
+        """Finish building this array
+
+        Performs any steps required to return a valid ArrowArray and optionally
+        validates the output to ensure that the result is valid (given the information
+        the array has available to it).
+
+        Parameters
+        ----------
+        validation_level : None, "full", "default", "minimal", or "none", optional
+            Explicitly define a validation level or use None to perform default
+            validation if possible. Validation may not be possible if children
+            were set that were not created by nanoarrow.
+        """
         self.c_array._assert_valid()
         cdef ArrowValidationLevel c_validation_level
         cdef Error error = Error()
@@ -725,6 +751,14 @@ cdef class CArrayBuilder:
 
 
 cdef class CDeviceArray:
+    """Low-level ArrowDeviceArray wrapper
+
+    This object is a literal wrapper around an ArrowDeviceArray. It provides field accessors
+    that return Python objects and handles the structure lifecycle (i.e., initialized
+    ArrowDeviceArray structures are always released).
+
+    See `nanoarrow.device.c_device_array()` for construction and usage examples.
+    """
 
     def __cinit__(self, object base, uintptr_t addr, CSchema schema):
         self._base = base
@@ -743,30 +777,30 @@ cdef class CDeviceArray:
         return CDeviceArray(holder, <uintptr_t>device_array_ptr, schema)
 
     @property
-    def schema(self):
+    def schema(self) -> CSchema:
         return self._schema
 
     @property
-    def device_type(self):
+    def device_type(self) -> DeviceType:
         return DeviceType(self._ptr.device_type)
 
     @property
-    def device_type_id(self):
+    def device_type_id(self) -> int:
         return self._ptr.device_type
 
     @property
-    def device_id(self):
+    def device_id(self) -> int:
         return self._ptr.device_id
 
     @property
-    def array(self):
+    def array(self) -> CArray:
         # TODO: We lose access to the sync_event here, so we probably need to
         # synchronize (or propagate it, or somehow prevent data access downstream)
         cdef CArray array = CArray(self, <uintptr_t>&self._ptr.array, self._schema)
         array._set_device(self._ptr.device_type, self._ptr.device_id)
         return array
 
-    def view(self):
+    def view(self) -> CArrayView:
         return self.array.view()
 
     def __arrow_c_array__(self, requested_schema=None):
@@ -785,7 +819,7 @@ cdef class CDeviceArray:
         return self._schema.__arrow_c_schema__(), device_array_capsule
 
     @staticmethod
-    def _import_from_c_capsule(schema_capsule, device_array_capsule):
+    def _import_from_c_capsule(schema_capsule, device_array_capsule) -> CDeviceArray:
         """
         Import from an ArrowSchema and ArrowArray PyCapsule tuple.
 
@@ -811,5 +845,5 @@ cdef class CDeviceArray:
 
         return out
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return _repr_utils.device_array_repr(self)
diff --git a/python/src/nanoarrow/_array_stream.pyx b/python/src/nanoarrow/_array_stream.pyx
@@ -17,19 +17,6 @@
 
 # cython: language_level = 3
 
-"""Low-level nanoarrow Python bindings
-
-This Cython extension provides low-level Python wrappers around the
-Arrow C Data and Arrow C Stream interface structs. In general, there
-is one wrapper per C struct and pointer validity is managed by keeping
-strong references to Python objects. These wrappers are intended to
-be literal and stay close to the structure definitions: higher level
-interfaces can and should be built in Python where it is faster to
-iterate and where it is easier to create a better user experience
-by default (i.e., classes, methods, and functions implemented in Python
-generally have better autocomplete + documentation available to IDEs).
-"""
-
 
 from libc.stdint cimport uintptr_t, int64_t
 from cpython.pycapsule cimport PyCapsule_GetPointer
@@ -58,6 +45,8 @@ from nanoarrow._utils cimport (
     Error
 )
 
+from typing import Iterable, List, Tuple
+
 from nanoarrow import _repr_utils
 
 
@@ -80,13 +69,35 @@ cdef class CArrayStream:
         self._cached_schema = None
 
     @staticmethod
-    def allocate():
+    def allocate() -> CArrayStream:
+        """Allocate a released ArrowArrayStream"""
         cdef ArrowArrayStream* c_array_stream_out
         base = alloc_c_array_stream(&c_array_stream_out)
         return CArrayStream(base, <uintptr_t>c_array_stream_out)
 
     @staticmethod
-    def from_c_arrays(arrays, CSchema schema, move=False, validate=True):
+    def from_c_arrays(arrays: List[CArray], CSchema schema, move=False, validate=True) -> CArrayStream:
+        """Create an ArrowArrayStream from an existing set of arrays
+
+        Given a previously resolved list of arrays, create an ArrowArrayStream
+        representation of the sequence of chunks.
+
+        Parameters
+        ----------
+        arrays : List[CArray]
+            A list of arrays to use as batches.
+        schema : CSchema
+            The schema that will be returned. Must be type equal with the schema
+            of each array (this is checked if validate is ``True``)
+        move : bool, optional
+            If True, transfer ownership from each array instead of creating a
+            shallow copy. This is only safe if the caller knows the origin of the
+            arrays and knows that they will not be accessed after this stream has been
+            created.
+        validate : bool, optional
+            If True, enforce type equality between the provided schema and the schema
+            of each array.
+        """
         cdef ArrowArrayStream* c_array_stream_out
         base = alloc_c_array_stream(&c_array_stream_out)
 
@@ -127,13 +138,13 @@ cdef class CArrayStream:
         return CArrayStream(base, <uintptr_t>c_array_stream_out)
 
     def release(self):
+        """Explicitly call the release callback of this stream"""
         if self.is_valid():
             self._ptr.release(self._ptr)
 
     @staticmethod
-    def _import_from_c_capsule(stream_capsule):
-        """
-        Import from a ArrowArrayStream PyCapsule.
+    def _import_from_c_capsule(stream_capsule) -> CArrayStream:
+        """Import from a ArrowArrayStream PyCapsule.
 
         Parameters
         ----------
@@ -172,10 +183,11 @@ cdef class CArrayStream:
         ArrowArrayStreamMove(self._ptr, c_array_stream_out)
         return array_stream_capsule
 
-    def _addr(self):
+    def _addr(self) -> int:
         return <uintptr_t>self._ptr
 
-    def is_valid(self):
+    def is_valid(self) -> bool:
+        """Check for a non-null and non-released underlying ArrowArrayStream"""
         return self._ptr != NULL and self._ptr.release != NULL
 
     def _assert_valid(self):
@@ -197,14 +209,17 @@ cdef class CArrayStream:
 
         return self._cached_schema
 
-    def get_schema(self):
+    def get_schema(self) -> CSchema:
         """Get the schema associated with this stream
+
+        Calling this method will always issue a call to the underlying stream's
+        get_schema callback.
         """
         out = CSchema.allocate()
         self._get_schema(out)
         return out
 
-    def get_next(self):
+    def get_next(self) -> CArray:
         """Get the next Array from this stream
 
         Raises StopIteration when there are no more arrays in this stream.
@@ -243,6 +258,13 @@ cdef class CArrayStream:
 
 
 cdef class CMaterializedArrayStream:
+    """Optimized representation of a fully consumed ArrowArrayStream
+
+    This class provides a data structure similar to pyarrow's ChunkedArray
+    where each consumed array is a referenced-counted shared array. This
+    class wraps the utilities provided by the nanoarrow C library to iterate
+    over and facilitate log(n) random access to items in this container.
+    """
     cdef CSchema _schema
     cdef CBuffer _array_ends
     cdef list _arrays
@@ -260,10 +282,10 @@ cdef class CMaterializedArrayStream:
         self._array_ends._set_data_type(<ArrowType>_types.INT64)
 
     @property
-    def schema(self):
+    def schema(self) -> CSchema:
         return self._schema
 
-    def __getitem__(self, k):
+    def __getitem__(self, k) -> Tuple[CArray, int]:
         cdef int64_t kint
         cdef int array_i
         cdef const int64_t* sorted_offsets = <int64_t*>self._array_ends._ptr.data
@@ -281,23 +303,23 @@ cdef class CMaterializedArrayStream:
         kint -= sorted_offsets[array_i]
         return self._arrays[array_i], kint
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self._array_ends[len(self._arrays)]
 
-    def __iter__(self):
+    def __iter__(self) -> Iterable[Tuple[CArray, int]]:
         for c_array in self._arrays:
             for item_i in range(len(c_array)):
                 yield c_array, item_i
 
-    def array(self, int64_t i):
+    def array(self, int64_t i) -> CArray:
         return self._arrays[i]
 
     @property
-    def n_arrays(self):
+    def n_arrays(self) -> int:
         return len(self._arrays)
 
     @property
-    def arrays(self):
+    def arrays(self) -> Iterable[CArray]:
         return iter(self._arrays)
 
     def __arrow_c_stream__(self, requested_schema=None):
@@ -312,7 +334,7 @@ cdef class CMaterializedArrayStream:
 
         return stream.__arrow_c_stream__(requested_schema=requested_schema)
 
-    def child(self, int64_t i):
+    def child(self, int64_t i) -> CMaterializedArrayStream:
         cdef CMaterializedArrayStream out = CMaterializedArrayStream()
         cdef int code
 
@@ -327,7 +349,12 @@ cdef class CMaterializedArrayStream:
         return out
 
     @staticmethod
-    def from_c_arrays(arrays, CSchema schema, bint validate=True):
+    def from_c_arrays(arrays: Iterable[CArray], CSchema schema, bint validate=True):
+        """"Create a materialized array stream from an existing iterable of arrays
+
+        This is slightly more efficient than creating a stream and then consuming it
+        because the implementation can avoid a shallow copy of each array.
+        """
         cdef CMaterializedArrayStream out = CMaterializedArrayStream()
 
         for array in arrays:
@@ -350,15 +377,19 @@ cdef class CMaterializedArrayStream:
         return out
 
     @staticmethod
-    def from_c_array(CArray array):
+    def from_c_array(CArray array) -> CMaterializedArrayStream:
+        """"Create a materialized array stream from a single array
+        """
         return CMaterializedArrayStream.from_c_arrays(
             [array],
             array.schema,
             validate=False
         )
 
     @staticmethod
-    def from_c_array_stream(CArrayStream stream):
+    def from_c_array_stream(CArrayStream stream) -> CMaterializedArrayStream:
+        """"Create a materialized array stream from an unmaterialized ArrowArrayStream
+        """
         with stream:
             return CMaterializedArrayStream.from_c_arrays(
                 stream,