Skip to content

Commit

Permalink
more docs
Browse files Browse the repository at this point in the history
  • Loading branch information
paleolimbot committed Jul 18, 2024
1 parent 78dc855 commit 063f1ec
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 44 deletions.
56 changes: 45 additions & 11 deletions python/src/nanoarrow/_array.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,7 @@ cdef class CArray:

@staticmethod
def _import_from_c_capsule(schema_capsule, array_capsule) -> CArray:
"""
Import from a ArrowSchema and ArrowArray PyCapsule tuple.
"""Import from a ArrowSchema and ArrowArray PyCapsule tuple.

Parameters
----------
Expand Down Expand Up @@ -518,6 +517,7 @@ cdef class CArrayBuilder:
return CArrayBuilder(CArray.allocate(CSchema.allocate()))

def is_empty(self) -> bool:
"""Check if any items have been appended to this builder"""
if self._ptr.release == NULL:
raise RuntimeError("CArrayBuilder is not initialized")

Expand Down Expand Up @@ -547,6 +547,11 @@ cdef class CArrayBuilder:
return self

def start_appending(self) -> CArrayBuilder:
"""Use append mode for building this ArrowArray

Calling this method is required to produce a valid array prior to calling
:meth:`append_strings` or `append_bytes`.
"""
cdef int code = ArrowArrayStartAppending(self._ptr)
Error.raise_error_not_ok("ArrowArrayStartAppending()", code)
return self
Expand Down Expand Up @@ -661,7 +666,8 @@ cdef class CArrayBuilder:
the ArrowBuffer will be invalidated, which is usually the desired behaviour
if you built or imported a buffer specifically to build this array. If move
is False (the default), this function will a make a shallow copy via another
layer of Python object wrapping."""
layer of Python object wrapping.
"""
if i < 0 or i > 3:
raise IndexError("i must be >= 0 and <= 3")

Expand All @@ -679,6 +685,13 @@ cdef class CArrayBuilder:
return self

def set_child(self, int64_t i, CArray c_array, move=False) -> CArrayBuilder:
"""Set an ArrowArray child

Set a child of this array by performing a show copy or optionally
transferring ownership to this object. The initialized child array
must have been initialized before this call by initializing this
builder with a schema containing the correct number of children.
"""
cdef CArray child = self.c_array.child(i)
if child._ptr.release != NULL:
ArrowArrayRelease(child._ptr)
Expand All @@ -696,6 +709,19 @@ cdef class CArrayBuilder:
return self

def finish(self, validation_level=None) -> CArray:
"""Finish building this array

Performs any steps required to return a valid ArrowArray and optionally
validates the output to ensure that the result is valid (given the information
the array has available to it).

Parameters
----------
validation_level : None, "full", "default", "minimal", or "none", optional
Explicitly define a validation level or use None to perform default
validation if possible. Validation may not be possible if children
were set that were not created by nanoarrow.
"""
self.c_array._assert_valid()
cdef ArrowValidationLevel c_validation_level
cdef Error error = Error()
Expand Down Expand Up @@ -725,6 +751,14 @@ cdef class CArrayBuilder:


cdef class CDeviceArray:
"""Low-level ArrowDeviceArray wrapper
This object is a literal wrapper around an ArrowDeviceArray. It provides field accessors
that return Python objects and handles the structure lifecycle (i.e., initialized
ArrowDeviceArray structures are always released).
See `nanoarrow.device.c_device_array()` for construction and usage examples.
"""

def __cinit__(self, object base, uintptr_t addr, CSchema schema):
self._base = base
Expand All @@ -743,30 +777,30 @@ cdef class CDeviceArray:
return CDeviceArray(holder, <uintptr_t>device_array_ptr, schema)

@property
def schema(self):
def schema(self) -> CSchema:
return self._schema

@property
def device_type(self):
def device_type(self) -> DeviceType:
return DeviceType(self._ptr.device_type)

@property
def device_type_id(self):
def device_type_id(self) -> int:
return self._ptr.device_type

@property
def device_id(self):
def device_id(self) -> int:
return self._ptr.device_id

@property
def array(self):
def array(self) -> CArray:
# TODO: We lose access to the sync_event here, so we probably need to
# synchronize (or propagate it, or somehow prevent data access downstream)
cdef CArray array = CArray(self, <uintptr_t>&self._ptr.array, self._schema)
array._set_device(self._ptr.device_type, self._ptr.device_id)
return array

def view(self):
def view(self) -> CArrayView:
return self.array.view()

def __arrow_c_array__(self, requested_schema=None):
Expand All @@ -785,7 +819,7 @@ cdef class CDeviceArray:
return self._schema.__arrow_c_schema__(), device_array_capsule

@staticmethod
def _import_from_c_capsule(schema_capsule, device_array_capsule):
def _import_from_c_capsule(schema_capsule, device_array_capsule) -> CDeviceArray:
"""
Import from an ArrowSchema and ArrowArray PyCapsule tuple.

Expand All @@ -811,5 +845,5 @@ cdef class CDeviceArray:

return out

def __repr__(self):
def __repr__(self) -> str:
return _repr_utils.device_array_repr(self)
97 changes: 64 additions & 33 deletions python/src/nanoarrow/_array_stream.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,6 @@

# cython: language_level = 3

"""Low-level nanoarrow Python bindings
This Cython extension provides low-level Python wrappers around the
Arrow C Data and Arrow C Stream interface structs. In general, there
is one wrapper per C struct and pointer validity is managed by keeping
strong references to Python objects. These wrappers are intended to
be literal and stay close to the structure definitions: higher level
interfaces can and should be built in Python where it is faster to
iterate and where it is easier to create a better user experience
by default (i.e., classes, methods, and functions implemented in Python
generally have better autocomplete + documentation available to IDEs).
"""


from libc.stdint cimport uintptr_t, int64_t
from cpython.pycapsule cimport PyCapsule_GetPointer
Expand Down Expand Up @@ -58,6 +45,8 @@ from nanoarrow._utils cimport (
Error
)

from typing import Iterable, List, Tuple

from nanoarrow import _repr_utils


Expand All @@ -80,13 +69,35 @@ cdef class CArrayStream:
self._cached_schema = None

@staticmethod
def allocate():
def allocate() -> CArrayStream:
"""Allocate a released ArrowArrayStream"""
cdef ArrowArrayStream* c_array_stream_out
base = alloc_c_array_stream(&c_array_stream_out)
return CArrayStream(base, <uintptr_t>c_array_stream_out)

@staticmethod
def from_c_arrays(arrays, CSchema schema, move=False, validate=True):
def from_c_arrays(arrays: List[CArray], CSchema schema, move=False, validate=True) -> CArrayStream:
"""Create an ArrowArrayStream from an existing set of arrays

Given a previously resolved list of arrays, create an ArrowArrayStream
representation of the sequence of chunks.

Parameters
----------
arrays : List[CArray]
A list of arrays to use as batches.
schema : CSchema
The schema that will be returned. Must be type equal with the schema
of each array (this is checked if validate is ``True``)
move : bool, optional
If True, transfer ownership from each array instead of creating a
shallow copy. This is only safe if the caller knows the origin of the
arrays and knows that they will not be accessed after this stream has been
created.
validate : bool, optional
If True, enforce type equality between the provided schema and the schema
of each array.
"""
cdef ArrowArrayStream* c_array_stream_out
base = alloc_c_array_stream(&c_array_stream_out)

Expand Down Expand Up @@ -127,13 +138,13 @@ cdef class CArrayStream:
return CArrayStream(base, <uintptr_t>c_array_stream_out)

def release(self):
"""Explicitly call the release callback of this stream"""
if self.is_valid():
self._ptr.release(self._ptr)

@staticmethod
def _import_from_c_capsule(stream_capsule):
"""
Import from a ArrowArrayStream PyCapsule.
def _import_from_c_capsule(stream_capsule) -> CArrayStream:
"""Import from a ArrowArrayStream PyCapsule.

Parameters
----------
Expand Down Expand Up @@ -172,10 +183,11 @@ cdef class CArrayStream:
ArrowArrayStreamMove(self._ptr, c_array_stream_out)
return array_stream_capsule

def _addr(self):
def _addr(self) -> int:
return <uintptr_t>self._ptr

def is_valid(self):
def is_valid(self) -> bool:
"""Check for a non-null and non-released underlying ArrowArrayStream"""
return self._ptr != NULL and self._ptr.release != NULL

def _assert_valid(self):
Expand All @@ -197,14 +209,17 @@ cdef class CArrayStream:

return self._cached_schema

def get_schema(self):
def get_schema(self) -> CSchema:
"""Get the schema associated with this stream

Calling this method will always issue a call to the underlying stream's
get_schema callback.
"""
out = CSchema.allocate()
self._get_schema(out)
return out

def get_next(self):
def get_next(self) -> CArray:
"""Get the next Array from this stream

Raises StopIteration when there are no more arrays in this stream.
Expand Down Expand Up @@ -243,6 +258,13 @@ cdef class CArrayStream:


cdef class CMaterializedArrayStream:
"""Optimized representation of a fully consumed ArrowArrayStream
This class provides a data structure similar to pyarrow's ChunkedArray
where each consumed array is a referenced-counted shared array. This
class wraps the utilities provided by the nanoarrow C library to iterate
over and facilitate log(n) random access to items in this container.
"""
cdef CSchema _schema
cdef CBuffer _array_ends
cdef list _arrays
Expand All @@ -260,10 +282,10 @@ cdef class CMaterializedArrayStream:
self._array_ends._set_data_type(<ArrowType>_types.INT64)

@property
def schema(self):
def schema(self) -> CSchema:
return self._schema

def __getitem__(self, k):
def __getitem__(self, k) -> Tuple[CArray, int]:
cdef int64_t kint
cdef int array_i
cdef const int64_t* sorted_offsets = <int64_t*>self._array_ends._ptr.data
Expand All @@ -281,23 +303,23 @@ cdef class CMaterializedArrayStream:
kint -= sorted_offsets[array_i]
return self._arrays[array_i], kint

def __len__(self):
def __len__(self) -> int:
return self._array_ends[len(self._arrays)]

def __iter__(self):
def __iter__(self) -> Iterable[Tuple[CArray, int]]:
for c_array in self._arrays:
for item_i in range(len(c_array)):
yield c_array, item_i

def array(self, int64_t i):
def array(self, int64_t i) -> CArray:
return self._arrays[i]

@property
def n_arrays(self):
def n_arrays(self) -> int:
return len(self._arrays)

@property
def arrays(self):
def arrays(self) -> Iterable[CArray]:
return iter(self._arrays)

def __arrow_c_stream__(self, requested_schema=None):
Expand All @@ -312,7 +334,7 @@ cdef class CMaterializedArrayStream:

return stream.__arrow_c_stream__(requested_schema=requested_schema)

def child(self, int64_t i):
def child(self, int64_t i) -> CMaterializedArrayStream:
cdef CMaterializedArrayStream out = CMaterializedArrayStream()
cdef int code

Expand All @@ -327,7 +349,12 @@ cdef class CMaterializedArrayStream:
return out

@staticmethod
def from_c_arrays(arrays, CSchema schema, bint validate=True):
def from_c_arrays(arrays: Iterable[CArray], CSchema schema, bint validate=True):
""""Create a materialized array stream from an existing iterable of arrays
This is slightly more efficient than creating a stream and then consuming it
because the implementation can avoid a shallow copy of each array.
"""
cdef CMaterializedArrayStream out = CMaterializedArrayStream()

for array in arrays:
Expand All @@ -350,15 +377,19 @@ cdef class CMaterializedArrayStream:
return out

@staticmethod
def from_c_array(CArray array):
def from_c_array(CArray array) -> CMaterializedArrayStream:
""""Create a materialized array stream from a single array
"""
return CMaterializedArrayStream.from_c_arrays(
[array],
array.schema,
validate=False
)

@staticmethod
def from_c_array_stream(CArrayStream stream):
def from_c_array_stream(CArrayStream stream) -> CMaterializedArrayStream:
""""Create a materialized array stream from an unmaterialized ArrowArrayStream
"""
with stream:
return CMaterializedArrayStream.from_c_arrays(
stream,
Expand Down

0 comments on commit 063f1ec

Please sign in to comment.