Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Add column-wise buffer builder #464

Merged
merged 42 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 41 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
e924e9d
start on column builders
paleolimbot May 13, 2024
2049e63
maybe working tests
paleolimbot May 13, 2024
4526c0b
prototype null handling
paleolimbot May 13, 2024
f611ee8
some null handling
paleolimbot May 13, 2024
4fd4c2d
first stab at dispatch
paleolimbot May 13, 2024
3792f4c
nulls in columns
paleolimbot May 13, 2024
68fa3ff
fix doctests
paleolimbot May 14, 2024
b9be69a
Apply suggestions from code review
paleolimbot May 14, 2024
dd6c4f5
remove masked array handling
paleolimbot May 14, 2024
a427fca
document null handlers
paleolimbot May 14, 2024
62d5dd3
use base class
paleolimbot May 14, 2024
b474b29
fix doctests
paleolimbot May 14, 2024
14e3067
better doc
paleolimbot May 14, 2024
b249e99
test non-nullable dispatch
paleolimbot May 14, 2024
eb1aacc
use two columns in docs
paleolimbot May 16, 2024
ae44f7d
rename and export nulls_debug
paleolimbot May 16, 2024
67a7b44
test null error
paleolimbot May 16, 2024
6c1cde8
clarify null_count=-1
paleolimbot May 16, 2024
db7400c
Apply suggestions from code review
paleolimbot May 16, 2024
9ccd5a5
remove outdated param def
paleolimbot May 16, 2024
a7d5b90
improve handle_nulls docs
paleolimbot May 16, 2024
60f101c
use None as sentinel for "all valid"
paleolimbot May 16, 2024
b734af0
document is_valid as None
paleolimbot May 16, 2024
561f8ca
clarify sentinel handling
paleolimbot May 16, 2024
63adc2b
fix sentinel handler
paleolimbot May 16, 2024
beb1ce6
fix test
paleolimbot May 16, 2024
8899fcf
get names from the schema and not from the child visitors
paleolimbot May 16, 2024
8180f31
Update python/src/nanoarrow/visitor.py
paleolimbot May 16, 2024
c2d6813
Update python/src/nanoarrow/visitor.py
paleolimbot May 16, 2024
9a57453
add comment
paleolimbot May 16, 2024
7a2fb86
Builder -> Converter
paleolimbot May 17, 2024
408e2be
to_column -> convert
paleolimbot May 17, 2024
5ee1fc6
change names
paleolimbot May 17, 2024
e1d1690
remove usage of "column"
paleolimbot May 17, 2024
b881976
force kwarg for nulls, mark experimental
paleolimbot May 17, 2024
62c85fd
fix doctests
paleolimbot May 17, 2024
d2730a2
less confusing converter names
paleolimbot May 17, 2024
ca0d5eb
convert -> to_pysequence
paleolimbot May 17, 2024
78e666b
a few more refs
paleolimbot May 17, 2024
4c9d2d2
use arrayviewvisitor instead of arraystream visitor
paleolimbot May 17, 2024
3412b0f
fix some references
paleolimbot May 17, 2024
9d27bd1
one more rename
paleolimbot May 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/src/nanoarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
)
from nanoarrow.array import array, Array
from nanoarrow.array_stream import ArrayStream
from nanoarrow.visitor import nulls_as_sentinel, nulls_forbid, nulls_separate
from nanoarrow._version import __version__ # noqa: F401

# Helps Sphinx automatically populate an API reference section
Expand Down Expand Up @@ -113,6 +114,9 @@
"large_list",
"list_",
"null",
"nulls_as_sentinel",
"nulls_forbid",
"nulls_separate",
"string",
"struct",
"schema",
Expand Down
38 changes: 33 additions & 5 deletions python/src/nanoarrow/_lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,11 @@ cdef class CSchemaView:
if self.extension_name or self._schema_view.type != self._schema_view.storage_type:
return None

# String/binary types do not have format strings as far as the Python
# buffer protocol is concerned
if self.layout.n_buffers != 2:
return None

cdef char out[128]
cdef int element_size_bits = 0
if self._schema_view.type == NANOARROW_TYPE_FIXED_SIZE_BINARY:
Expand Down Expand Up @@ -1632,6 +1637,22 @@ cdef class CArrayView:

@property
def null_count(self):
if self._ptr.null_count != -1:
return self._ptr.null_count

cdef ArrowBufferType buffer_type = self._ptr.layout.buffer_type[0]
cdef uint8_t* validity_bits = self._ptr.buffer_views[0].data.as_uint8

if buffer_type != NANOARROW_BUFFER_TYPE_VALIDITY:
self._ptr.null_count = 0
elif validity_bits == NULL:
self._ptr.null_count = 0
elif self._device is DEVICE_CPU:
self._ptr.null_count = (
self._ptr.length -
ArrowBitCountSet(validity_bits, self.offset, self.length)
)

return self._ptr.null_count

@property
Expand Down Expand Up @@ -1869,7 +1890,7 @@ cdef class CBufferView:
return self._format.decode("UTF-8")

@property
def item_size(self):
def itemsize(self):
return self._strides

def __len__(self):
Expand Down Expand Up @@ -1957,7 +1978,7 @@ cdef class CBufferView:

cdef int64_t c_offset = offset
cdef int64_t c_length = length
cdef int64_t c_item_size = self.item_size
cdef int64_t c_item_size = self.itemsize
cdef int64_t c_dest_offset = dest_offset
self._check_copy_into_bounds(&buffer, c_offset, c_length, dest_offset, c_item_size)

Expand Down Expand Up @@ -2010,7 +2031,7 @@ cdef class CBufferView:
if length is None:
length = self.n_elements

cdef int64_t bytes_to_copy = length * self.item_size
cdef int64_t bytes_to_copy = length * self.itemsize
out = CBufferBuilder().set_data_type(self.data_type_id)
out.reserve_bytes(bytes_to_copy)
self.copy_into(out, offset, length)
Expand Down Expand Up @@ -2224,9 +2245,9 @@ cdef class CBuffer:
return self._element_size_bits

@property
def item_size(self):
def itemsize(self):
self._assert_valid()
return self._view.item_size
return self._view.itemsize

@property
def format(self):
Expand Down Expand Up @@ -2339,6 +2360,13 @@ cdef class CBufferBuilder:
"""The number of bytes that have been written to this buffer"""
return self._buffer.size_bytes

@property
def itemsize(self):
return self._buffer.itemsize

def __len__(self):
return self._buffer.size_bytes // self.itemsize

@property
def capacity_bytes(self):
"""The number of bytes allocated in the underlying buffer"""
Expand Down
42 changes: 3 additions & 39 deletions python/src/nanoarrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import itertools
from functools import cached_property
from typing import Iterable, List, Sequence, Tuple
from typing import Iterable, Tuple

from nanoarrow._lib import (
DEVICE_CPU,
Expand All @@ -32,7 +32,7 @@
from nanoarrow.c_schema import c_schema
from nanoarrow.iterator import iter_array_views, iter_py, iter_tuples
from nanoarrow.schema import Schema, _schema_repr
from nanoarrow.visitor import to_columns, to_pylist
from nanoarrow.visitor import ArrayViewVisitable

from nanoarrow import _repr_utils

Expand Down Expand Up @@ -106,7 +106,7 @@ def __arrow_c_array__(self, requested_schema=None):
return array.__arrow_c_array__(requested_schema=requested_schema)


class Array:
class Array(ArrayViewVisitable):
"""High-level in-memory Array representation

The Array is nanoarrow's high-level in-memory array representation whose
Expand Down Expand Up @@ -345,42 +345,6 @@ def iter_chunk_views(self) -> Iterable[CArrayView]:
"""
return iter_array_views(self)

def to_pylist(self) -> List:
"""Convert this Array to a ``list()`` of Python objects

Computes an identical value to list(:meth:`iter_py`) but can be several
times faster.

Examples
--------

>>> import nanoarrow as na
>>> array = na.Array([1, 2, 3], na.int32())
>>> array.to_pylist()
[1, 2, 3]
"""
return to_pylist(self)

def to_columns(self) -> Tuple[str, Sequence]:
"""Convert this Array to a ``list()`` of sequences

Converts a stream of struct arrays into its column-wise representation
such that each column is either a contiguous buffer or a ``list()``.

Examples
--------

>>> import nanoarrow as na
>>> import pyarrow as pa
>>> array = na.Array(pa.record_batch([pa.array([1, 2, 3])], names=["col1"]))
>>> names, columns = array.to_columns()
>>> names
['col1']
>>> columns
[[1, 2, 3]]
"""
return to_columns(self)

@property
def n_children(self) -> int:
"""Get the number of children for an Array of this type.
Expand Down
43 changes: 3 additions & 40 deletions python/src/nanoarrow/array_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@
# under the License.

from functools import cached_property
from typing import Iterable, List, Sequence, Tuple
from typing import Iterable, Tuple

from nanoarrow._lib import CMaterializedArrayStream
from nanoarrow._repr_utils import make_class_label
from nanoarrow.array import Array
from nanoarrow.c_array_stream import c_array_stream
from nanoarrow.iterator import iter_py, iter_tuples
from nanoarrow.schema import Schema, _schema_repr
from nanoarrow.visitor import to_columns, to_pylist
from nanoarrow.visitor import ArrayViewVisitable


class ArrayStream:
class ArrayStream(ArrayViewVisitable):
"""High-level ArrayStream representation

The ArrayStream is nanoarrow's high-level representation of zero
Expand Down Expand Up @@ -199,43 +199,6 @@ def iter_tuples(self) -> Iterable[Tuple]:
"""
return iter_tuples(self)

def to_pylist(self) -> List:
"""Convert this ArrayStream to a ``list()`` of Python objects

Computes an identical value to list(:meth:`iter_py`) but can be several
times faster.

Examples
--------

>>> import nanoarrow as na
>>> stream = na.ArrayStream([1, 2, 3], na.int32())
>>> stream.to_pylist()
[1, 2, 3]
"""
return to_pylist(self)

def to_columns(self) -> Tuple[str, Sequence]:
"""Convert this ArrayStream to a ``list()`` of sequences

Converts a stream of struct arrays into its column-wise representation
such that each column is either a contiguous buffer or a ``list()``.

Examples
--------

>>> import nanoarrow as na
>>> import pyarrow as pa
>>> batch = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
>>> stream = na.ArrayStream(batch)
>>> names, columns = stream.to_columns()
>>> names
['col1']
>>> columns
[[1, 2, 3]]
"""
return to_columns(self)

def __repr__(self) -> str:
cls = make_class_label(self, "nanoarrow")
schema_repr = _schema_repr(self.schema, prefix="", include_metadata=False)
Expand Down
6 changes: 1 addition & 5 deletions python/src/nanoarrow/iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,7 @@ def _object_label(self):
return f"<unnamed {self._schema_view.type}>"

def _contains_nulls(self):
return (
self._schema_view.nullable
and len(self._array_view.buffer(0))
and self._array_view.null_count != 0
)
return self._schema_view.nullable and self._array_view.null_count != 0

def _set_array(self, array):
self._array_view._set_array(array)
Expand Down
Loading
Loading