From 879d75e2904473098975b6ba32f59d48d703aea6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 23 Feb 2023 15:06:03 -0400 Subject: [PATCH 01/52] maybe schema class --- python/.gitignore | 2 +- python/setup.py | 3 +- python/src/nanoarrow/__init__.py | 18 +++++ python/src/nanoarrow/_lib.pyx | 85 +++++++++++++++++++++- python/src/nanoarrow/arrow_c.pxd | 55 +++++++++++++++ python/src/nanoarrow/nanoarrow_c.pxd | 101 ++++++++++++++++++--------- python/tests/test_nanoarrow.py | 22 ++++-- 7 files changed, 241 insertions(+), 45 deletions(-) create mode 100644 python/src/nanoarrow/arrow_c.pxd diff --git a/python/.gitignore b/python/.gitignore index fcf8363ba..a73fd3d06 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -18,7 +18,7 @@ src/nanoarrow/nanoarrow.c src/nanoarrow/nanoarrow.h -src/nanoarrow/*.cpp +src/nanoarrow/*.c # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/python/setup.py b/python/setup.py index f6f7efb1c..b89cf1903 100644 --- a/python/setup.py +++ b/python/setup.py @@ -24,7 +24,6 @@ import numpy as np - # setuptools gets confused by relative paths that extend above the project root target = Path(__file__).parent / "src" / "nanoarrow" shutil.copy( @@ -39,7 +38,7 @@ Extension( name="nanoarrow._lib", include_dirs=[np.get_include(), "src/nanoarrow"], - language="c++", + language="c", sources=[ "src/nanoarrow/_lib.pyx", "src/nanoarrow/nanoarrow.c", diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 1586e60ab..9a148a4fc 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -17,4 +17,22 @@ from ._lib import ( # noqa: F401 as_numpy_array, + version, + CSchemaHolder, + CSchema, ) + +class Schema(CSchema): + + def __init__(self, parent=None, addr=None) -> None: + if parent is None: + parent = CSchemaHolder() + if addr is None: + addr = parent._addr() + super().__init__(parent, addr) + + @staticmethod + def from_pyarrow(obj): + schema = Schema() + 
obj._export_to_c(schema._addr()) + return schema diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index a6b4da153..ba9cd21f9 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -19,8 +19,8 @@ """Low-level nanoarrow Python bindings.""" -from libc.stdint cimport uint8_t, uintptr_t - +from libc.stdint cimport uint8_t, uintptr_t, int64_t +from cpython.mem cimport PyMem_Malloc, PyMem_Free from nanoarrow_c cimport * import numpy as np @@ -84,3 +84,84 @@ def as_numpy_array(arr): # TODO set base return result + + +def version(): + return ArrowNanoarrowVersion().decode("UTF-8") + +cdef class CSchemaHolder: + cdef ArrowSchema c_schema + + def __init__(self): + self.c_schema.release = NULL + + def __del__(self): + if self.c_schema.release != NULL: + self.c_schema.release(&self.c_schema) + + def _addr(self): + return &self.c_schema + +cdef class CSchemaChildren: + cdef CSchema _parent + cdef int64_t _length + + def __init__(self, CSchema parent): + self._parent = parent + self._length = parent._ptr.n_children + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + + return type(self._parent)(self._parent, self._child_addr(k)) + + cdef _child_addr(self, int64_t i): + cdef ArrowSchema** children = self._parent._ptr.children + cdef ArrowSchema* child = children[i] + return child + +cdef class CSchema: + cdef object _base + cdef ArrowSchema* _ptr + + def __init__(self, object base, uintptr_t addr) -> None: + self._base = base, + self._ptr = addr + + def _addr(self): + return self._ptr + + def __repr__(self) -> str: + cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True) + cdef char* out = PyMem_Malloc(n_chars + 1) + if not out: + raise MemoryError() + + ArrowSchemaToString(self._ptr, out, n_chars + 1, True) + out_str = out.decode("UTF-8") + PyMem_Free(out) + + return out_str + + 
@property + def format(self): + if self._ptr.format != NULL: + return self._ptr.format.decode("UTF-8") + + @property + def name(self): + if self._ptr.name != NULL: + return self._ptr.name.decode("UTF-8") + + @property + def flags(self): + return self._ptr.flags + + @property + def children(self): + return CSchemaChildren(self) diff --git a/python/src/nanoarrow/arrow_c.pxd b/python/src/nanoarrow/arrow_c.pxd new file mode 100644 index 000000000..a5f98c8af --- /dev/null +++ b/python/src/nanoarrow/arrow_c.pxd @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: language_level = 3 + +from libc.stdint cimport int64_t + +cdef extern from "nanoarrow.h": + cdef int ARROW_FLAG_DICTIONARY_ORDERED + cdef int ARROW_FLAG_NULLABLE + cdef int ARROW_FLAG_MAP_KEYS_SORTED + + cdef struct ArrowSchema: + const char* format + const char* name + const char* metadata + int64_t flags + int64_t n_children + ArrowSchema** children + ArrowSchema* dictionary + void (*release)(ArrowSchema*) + void* private_data + + cdef struct ArrowArray: + int64_t length + int64_t null_count + int64_t offset + int64_t n_buffers + int64_t n_children + const void** buffers + ArrowArray** children + ArrowArray* dictionary + void (*release)(ArrowArray*) + void* private_data + + cdef struct ArrowArrayStream: + int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) + int (*get_next)(ArrowArrayStream* stream, ArrowArray* out) + const char* (*get_last_error)(ArrowArrayStream*) + void (*release)(ArrowArrayStream* stream) + void* private_data diff --git a/python/src/nanoarrow/nanoarrow_c.pxd b/python/src/nanoarrow/nanoarrow_c.pxd index 440f449c1..2d76e0d8a 100644 --- a/python/src/nanoarrow/nanoarrow_c.pxd +++ b/python/src/nanoarrow/nanoarrow_c.pxd @@ -17,30 +17,20 @@ # cython: language_level = 3 -from libc.stdint cimport int64_t, int8_t, uint8_t +from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t +from arrow_c cimport ArrowSchema, ArrowArray, ArrowArrayStream cdef extern from "nanoarrow.h": - struct ArrowSchema: - const char* format - int64_t n_children - void (*release)(ArrowSchema*) - - struct ArrowArray: - int64_t length - int64_t null_count - int64_t offset - const void** buffers - void (*release)(ArrowArray*) - - struct ArrowArrayStream: - int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) - ctypedef int ArrowErrorCode + cdef int NANOARROW_OK + + cdef struct ArrowError: + pass enum ArrowType: - NANOARROW_TYPE_UNINITIALIZED = 0 - NANOARROW_TYPE_NA = 1 + NANOARROW_TYPE_UNINITIALIZED + 
NANOARROW_TYPE_NA NANOARROW_TYPE_BOOL NANOARROW_TYPE_UINT8 NANOARROW_TYPE_INT8 @@ -87,34 +77,53 @@ cdef extern from "nanoarrow.h": NANOARROW_BUFFER_TYPE_DATA_OFFSET NANOARROW_BUFFER_TYPE_DATA - struct ArrowError: - pass + enum ArrowTimeUnit: + NANOARROW_TIME_UNIT_SECOND + NANOARROW_TIME_UNIT_MILLI + NANOARROW_TIME_UNIT_MICRO + NANOARROW_TIME_UNIT_NANO - const char* ArrowErrorMessage(ArrowError* error) - - struct ArrowLayout: - ArrowBufferType buffer_type[3] - int64_t element_size_bits[3] - int64_t child_size_elements + cdef struct ArrowStringView: + const char* data + int64_t size_bytes cdef union buffer_data: const void* data const int8_t* as_int8 const uint8_t* as_uint8 - - struct ArrowBufferView: + const int16_t* as_int16 + const uint16_t* as_uint16 + const int32_t* as_int32 + const uint32_t* as_uint32 + const int64_t* as_int64 + const uint64_t* as_uint64 + const double* as_double + const float* as_float + const char* as_char + + cdef struct ArrowBufferView: buffer_data data int64_t size_bytes - struct ArrowBuffer: + cdef struct ArrowBufferAllocator: + pass + + cdef struct ArrowBuffer: uint8_t* data int64_t size_bytes + int64_t capacity_bytes + ArrowBufferAllocator allocator - struct ArrowBitmap: + cdef struct ArrowBitmap: ArrowBuffer buffer int64_t size_bits - struct ArrowArrayView: + cdef struct ArrowLayout: + ArrowBufferType buffer_type[3] + int64_t element_size_bits[3] + int64_t child_size_elements + + cdef struct ArrowArrayView: ArrowArray* array ArrowType storage_type ArrowLayout layout @@ -122,6 +131,30 @@ cdef extern from "nanoarrow.h": int64_t n_children ArrowArrayView** children - ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error) - ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error) - int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to) + cdef const char* ArrowNanoarrowVersion() + cdef const char* 
ArrowErrorMessage(ArrowError* error) + + cdef void ArrowSchemaMove(ArrowSchema* src, ArrowSchema* dst) + cdef void ArrowArrayMove(ArrowArray* src, ArrowArray* dst) + cdef void ArrowArrayStreamMove(ArrowArrayStream* src, ArrowArrayStream* dst) + + cdef int64_t ArrowSchemaToString(ArrowSchema* schema, char* out, int64_t n, + char recursive) + cdef ArrowErrorCode ArrowSchemaDeepCopy(ArrowSchema* schema, + ArrowSchema* schema_out) + cdef ArrowErrorCode ArrowSchemaSetType(ArrowSchema* schema,ArrowType type_) + ArrowErrorCode ArrowSchemaSetTypeStruct(ArrowSchema* schema, int64_t n_children) + + cdef struct ArrowMetadataReader: + pass + + cdef ArrowErrorCode ArrowMetadataReaderInit(ArrowMetadataReader* reader, + const char* metadata) + + cdef ArrowErrorCode ArrowMetadataReaderRead(ArrowMetadataReader* reader, + ArrowStringView* key_out, + ArrowStringView* value_out) + + cdef ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error) + cdef ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error) + cdef int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index fd76534e1..2e3bbb709 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -1,27 +1,37 @@ import numpy as np import pyarrow as pa -import nanoarrow +import nanoarrow as na import pytest +def test_version(): + assert(na.version() == "0.1.0-SNAPSHOT") def test_as_numpy_array(): - + arr = pa.array([1, 2, 3]) - result = nanoarrow.as_numpy_array(arr) + result = na.as_numpy_array(arr) expected = arr.to_numpy() np.testing.assert_array_equal(result, expected) arr = pa.array([1, 2, 3], pa.uint8()) - result = nanoarrow.as_numpy_array(arr) + result = na.as_numpy_array(arr) expected = arr.to_numpy() np.testing.assert_array_equal(result, expected) arr = pa.array([1, 2, None]) with 
pytest.raises(ValueError, match="Cannot convert array with nulls"): - nanoarrow.as_numpy_array(arr) + na.as_numpy_array(arr) arr = pa.array([[1], [2, 3]]) with pytest.raises(TypeError, match="Cannot convert a non-primitive array"): - nanoarrow.as_numpy_array(arr) + na.as_numpy_array(arr) + +def test_schema(): + pa_schema = pa.schema([pa.field("some_name", pa.int32())]) + na_schema = na.Schema.from_pyarrow(pa_schema) + assert(na_schema.format == "+s") + assert(na_schema.flags == 0) + assert(len(na_schema.children), 1) + assert(na_schema.children[0].format == "i") From cff939da42831b29c46a560e7d64ebc60945d67e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 16:44:46 -0400 Subject: [PATCH 02/52] remove pxds --- python/src/nanoarrow/arrow_c.pxd | 55 --------- python/src/nanoarrow/nanoarrow_c.pxd | 160 --------------------------- 2 files changed, 215 deletions(-) delete mode 100644 python/src/nanoarrow/arrow_c.pxd delete mode 100644 python/src/nanoarrow/nanoarrow_c.pxd diff --git a/python/src/nanoarrow/arrow_c.pxd b/python/src/nanoarrow/arrow_c.pxd deleted file mode 100644 index a5f98c8af..000000000 --- a/python/src/nanoarrow/arrow_c.pxd +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# cython: language_level = 3 - -from libc.stdint cimport int64_t - -cdef extern from "nanoarrow.h": - cdef int ARROW_FLAG_DICTIONARY_ORDERED - cdef int ARROW_FLAG_NULLABLE - cdef int ARROW_FLAG_MAP_KEYS_SORTED - - cdef struct ArrowSchema: - const char* format - const char* name - const char* metadata - int64_t flags - int64_t n_children - ArrowSchema** children - ArrowSchema* dictionary - void (*release)(ArrowSchema*) - void* private_data - - cdef struct ArrowArray: - int64_t length - int64_t null_count - int64_t offset - int64_t n_buffers - int64_t n_children - const void** buffers - ArrowArray** children - ArrowArray* dictionary - void (*release)(ArrowArray*) - void* private_data - - cdef struct ArrowArrayStream: - int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) - int (*get_next)(ArrowArrayStream* stream, ArrowArray* out) - const char* (*get_last_error)(ArrowArrayStream*) - void (*release)(ArrowArrayStream* stream) - void* private_data diff --git a/python/src/nanoarrow/nanoarrow_c.pxd b/python/src/nanoarrow/nanoarrow_c.pxd deleted file mode 100644 index 2d76e0d8a..000000000 --- a/python/src/nanoarrow/nanoarrow_c.pxd +++ /dev/null @@ -1,160 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# cython: language_level = 3 - -from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t - -from arrow_c cimport ArrowSchema, ArrowArray, ArrowArrayStream - -cdef extern from "nanoarrow.h": - ctypedef int ArrowErrorCode - cdef int NANOARROW_OK - - cdef struct ArrowError: - pass - - enum ArrowType: - NANOARROW_TYPE_UNINITIALIZED - NANOARROW_TYPE_NA - NANOARROW_TYPE_BOOL - NANOARROW_TYPE_UINT8 - NANOARROW_TYPE_INT8 - NANOARROW_TYPE_UINT16 - NANOARROW_TYPE_INT16 - NANOARROW_TYPE_UINT32 - NANOARROW_TYPE_INT32 - NANOARROW_TYPE_UINT64 - NANOARROW_TYPE_INT64 - NANOARROW_TYPE_HALF_FLOAT - NANOARROW_TYPE_FLOAT - NANOARROW_TYPE_DOUBLE - NANOARROW_TYPE_STRING - NANOARROW_TYPE_BINARY - NANOARROW_TYPE_FIXED_SIZE_BINARY - NANOARROW_TYPE_DATE32 - NANOARROW_TYPE_DATE64 - NANOARROW_TYPE_TIMESTAMP - NANOARROW_TYPE_TIME32 - NANOARROW_TYPE_TIME64 - NANOARROW_TYPE_INTERVAL_MONTHS - NANOARROW_TYPE_INTERVAL_DAY_TIME - NANOARROW_TYPE_DECIMAL128 - NANOARROW_TYPE_DECIMAL256 - NANOARROW_TYPE_LIST - NANOARROW_TYPE_STRUCT - NANOARROW_TYPE_SPARSE_UNION - NANOARROW_TYPE_DENSE_UNION - NANOARROW_TYPE_DICTIONARY - NANOARROW_TYPE_MAP - NANOARROW_TYPE_EXTENSION - NANOARROW_TYPE_FIXED_SIZE_LIST - NANOARROW_TYPE_DURATION - NANOARROW_TYPE_LARGE_STRING - NANOARROW_TYPE_LARGE_BINARY - NANOARROW_TYPE_LARGE_LIST - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO - - enum ArrowBufferType: - NANOARROW_BUFFER_TYPE_NONE - NANOARROW_BUFFER_TYPE_VALIDITY - NANOARROW_BUFFER_TYPE_TYPE_ID - NANOARROW_BUFFER_TYPE_UNION_OFFSET - NANOARROW_BUFFER_TYPE_DATA_OFFSET - NANOARROW_BUFFER_TYPE_DATA - - enum ArrowTimeUnit: - NANOARROW_TIME_UNIT_SECOND - NANOARROW_TIME_UNIT_MILLI - NANOARROW_TIME_UNIT_MICRO - NANOARROW_TIME_UNIT_NANO - - cdef struct ArrowStringView: - const char* data - int64_t size_bytes - - cdef union buffer_data: - const void* data - const int8_t* as_int8 - const uint8_t* as_uint8 - const int16_t* as_int16 - const uint16_t* as_uint16 - const int32_t* as_int32 - const 
uint32_t* as_uint32 - const int64_t* as_int64 - const uint64_t* as_uint64 - const double* as_double - const float* as_float - const char* as_char - - cdef struct ArrowBufferView: - buffer_data data - int64_t size_bytes - - cdef struct ArrowBufferAllocator: - pass - - cdef struct ArrowBuffer: - uint8_t* data - int64_t size_bytes - int64_t capacity_bytes - ArrowBufferAllocator allocator - - cdef struct ArrowBitmap: - ArrowBuffer buffer - int64_t size_bits - - cdef struct ArrowLayout: - ArrowBufferType buffer_type[3] - int64_t element_size_bits[3] - int64_t child_size_elements - - cdef struct ArrowArrayView: - ArrowArray* array - ArrowType storage_type - ArrowLayout layout - ArrowBufferView buffer_views[3] - int64_t n_children - ArrowArrayView** children - - cdef const char* ArrowNanoarrowVersion() - cdef const char* ArrowErrorMessage(ArrowError* error) - - cdef void ArrowSchemaMove(ArrowSchema* src, ArrowSchema* dst) - cdef void ArrowArrayMove(ArrowArray* src, ArrowArray* dst) - cdef void ArrowArrayStreamMove(ArrowArrayStream* src, ArrowArrayStream* dst) - - cdef int64_t ArrowSchemaToString(ArrowSchema* schema, char* out, int64_t n, - char recursive) - cdef ArrowErrorCode ArrowSchemaDeepCopy(ArrowSchema* schema, - ArrowSchema* schema_out) - cdef ArrowErrorCode ArrowSchemaSetType(ArrowSchema* schema,ArrowType type_) - ArrowErrorCode ArrowSchemaSetTypeStruct(ArrowSchema* schema, int64_t n_children) - - cdef struct ArrowMetadataReader: - pass - - cdef ArrowErrorCode ArrowMetadataReaderInit(ArrowMetadataReader* reader, - const char* metadata) - - cdef ArrowErrorCode ArrowMetadataReaderRead(ArrowMetadataReader* reader, - ArrowStringView* key_out, - ArrowStringView* value_out) - - cdef ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error) - cdef ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error) - cdef int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, 
int64_t i_to) From 73eb934db2f4bf3df08b8a977ab9427955fbb312 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 16:56:28 -0400 Subject: [PATCH 03/52] generate the nanoarrow pxd file --- python/.gitignore | 1 + python/setup.py | 114 +++++++++++++++++++++++++++++++++ python/tests/test_nanoarrow.py | 2 +- 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/python/.gitignore b/python/.gitignore index a73fd3d06..8abd5d0de 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -18,6 +18,7 @@ src/nanoarrow/nanoarrow.c src/nanoarrow/nanoarrow.h +src/nanoarrow/nanoarrow_c.pxd src/nanoarrow/*.c # Byte-compiled / optimized / DLL files diff --git a/python/setup.py b/python/setup.py index b89cf1903..e0b3fe52a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -17,6 +17,9 @@ # specific language governing permissions and limitations # under the License. +import re +import os + import shutil from pathlib import Path @@ -24,6 +27,112 @@ import numpy as np +class NanoarrowPxdGenerator: + + def __init__(self): + self._define_regexes() + + def generate_nanoarrow_pxd(self, file_in, file_out): + file_in_name = os.path.basename(file_in) + + # Read the nanoarrow.h header + content = None + with open(file_in, 'r') as input: + content = input.read() + + # Strip comments + content = self.re_comment.sub('', content) + + # Find types and function definitions + types = self._find_types(content) + func_defs = self._find_func_defs(content) + + # Make corresponding cython definitions + types_cython = [self._type_to_cython(t, ' ') for t in types] + func_defs_cython = [self._func_def_to_cython(d, ' ') for d in func_defs] + + # Unindent the header + header = self.re_newline_plus_indent.sub('\n', self._pxd_header()) + + # Write nanoarrow_c.pxd + with open(file_out, 'wb') as output: + output.write(header.encode('UTF-8')) + + output.write(f'\ncdef extern from "{file_in_name}":\n'.encode("UTF-8")) + + for type in types_cython: + output.write(type.encode('UTF-8')) + 
output.write(b'\n\n') + + for func_def in func_defs_cython: + output.write(func_def.encode('UTF-8')) + output.write(b'\n') + + output.write(b'\n') + + def _define_regexes(self): + self.re_comment = re.compile(r'\s*//[^\n]*') + self.re_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}') + self.re_func_def = re.compile(r'\n(static inline )?(struct|enum )?(?P[A-Za-z]+) (?PArrow[A-Za-z]+)\((?P[^\)]*)\);') + self.re_tagged_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[A-Za-z]+)') + self.re_struct_delim = re.compile(r';\s*') + self.re_enum_delim = re.compile(r',\s*') + self.re_whitespace = re.compile(r'\s+') + self.re_newline_plus_indent = re.compile(r'\n +') + + def _strip_comments(self, content): + return self.re_comment.sub('', content) + + def _find_types(self, content): + return [m.groupdict() for m in self.re_type.finditer(content)] + + def _find_func_defs(self, content): + return [m.groupdict() for m in self.re_func_def.finditer(content)] + + def _type_to_cython(self, t, indent=''): + type = t['type'] + name = t['name'] + body = self.re_tagged_type.sub(r'\2', t['body'].strip()) + if type == 'enum': + items = [item for item in self.re_enum_delim.split(body) if item] + else: + items = [item for item in self.re_struct_delim.split(body) if item] + + cython_body = f'\n{indent} '.join([''] + items) + return f'{indent}cdef {type} {name}:{cython_body}' + + def _func_def_to_cython(self, d, indent=''): + return_type = d['return_type'] + name = d['name'] + args = re.sub(r'\s+', ' ', d['args'].strip()) + args = self.re_tagged_type.sub(r'\2', args) + return f'{indent}cdef {return_type} {name}({args})' + + def _pxd_header(self): + return """ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t + """ + + # setuptools gets confused by relative paths that extend above the project root target = Path(__file__).parent / "src" / "nanoarrow" shutil.copy( @@ -33,6 +142,11 @@ Path(__file__).parent / "../dist/nanoarrow.h", target / "nanoarrow.h" ) +NanoarrowPxdGenerator().generate_nanoarrow_pxd( + 'src/nanoarrow/nanoarrow.h', + 'src/nanoarrow/nanoarrow_c.pxd' +) + setup( ext_modules=[ Extension( diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 2e3bbb709..305b6615c 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -6,7 +6,7 @@ import pytest def test_version(): - assert(na.version() == "0.1.0-SNAPSHOT") + assert(na.version() == "0.2.0-SNAPSHOT") def test_as_numpy_array(): From 6153916f7298b9a5493104daa35690620d956814 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 17:09:47 -0400 Subject: [PATCH 04/52] completely invalid but working towards ideal setup.py --- python/setup.py | 68 +++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/python/setup.py b/python/setup.py index e0b3fe52a..e547b70f3 100644 --- a/python/setup.py +++ b/python/setup.py @@ -110,37 +110,51 @@ def _func_def_to_cython(self, d, indent=''): def 
_pxd_header(self): return """ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. + # Licensed to the Apache Software Foundation (ASF) under one + # or more contributor license agreements. See the NOTICE file + # distributed with this work for additional information + # regarding copyright ownership. The ASF licenses this file + # to you under the Apache License, Version 2.0 (the + # "License"); you may not use this file except in compliance + # with the License. You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, + # software distributed under the License is distributed on an + # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + # KIND, either express or implied. See the License for the + # specific language governing permissions and limitations + # under the License. 
+ + # cython: language_level = 3 + + from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t,\ + uint32_t, int64_t, uint64_t + """ -# cython: language_level = 3 +def copy_or_generate_nanoarrow_c(): + this_dir = os.path.abspath(os.path.dirname(__file__)) -from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t - """ + is_cmake_dir = 'CMakeLists.txt' in os.listdir('..') + is_in_nanoarrow_repo = 'nanoarrow.h' in os.listdir('../src/nanoarrow') + has_cmake = os.system('cmake --version') == 0 + build_dir = os.path.join('.', '_cmake') + source_dir = os.path.abspath(os.path.join('..')) + if has_cmake and is_cmake_dir and is_in_nanoarrow_repo: + try: + os.system(f'cmake -B "{build_dir}" -S "{source_dir}" -DNANOARROW_BUNDLE=ON') + os.system(f'cmake --install -B "{build_dir}" -DNANOARROW_BUNDLE=ON') + finally: + os.unlink(build_dir) -# setuptools gets confused by relative paths that extend above the project root -target = Path(__file__).parent / "src" / "nanoarrow" -shutil.copy( - Path(__file__).parent / "../dist/nanoarrow.c", target / "nanoarrow.c" -) -shutil.copy( - Path(__file__).parent / "../dist/nanoarrow.h", target / "nanoarrow.h" -) + elif is_in_nanoarrow_repo: + shutil.copyfile() + else: + raise ValueError('Attempt to build source distribution outside the nanoarrow repo') + +copy_or_generate_nanoarrow_c() NanoarrowPxdGenerator().generate_nanoarrow_pxd( 'src/nanoarrow/nanoarrow.h', From 1637afeaf052d7117560c561671ab925a557cc7a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 17:20:15 -0400 Subject: [PATCH 05/52] still invalid but better setup.py --- python/setup.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/setup.py b/python/setup.py index e547b70f3..68f538e9d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -19,9 +19,7 @@ import re import os - import shutil -from pathlib import Path from setuptools import Extension, setup @@ -134,20 
+132,23 @@ def _pxd_header(self): """ def copy_or_generate_nanoarrow_c(): + this_wd = os.getcwd() this_dir = os.path.abspath(os.path.dirname(__file__)) + source_dir = os.path.dirname(this_dir) - is_cmake_dir = 'CMakeLists.txt' in os.listdir('..') - is_in_nanoarrow_repo = 'nanoarrow.h' in os.listdir('../src/nanoarrow') + is_cmake_dir = 'CMakeLists.txt' in os.listdir(source_dir) + is_in_nanoarrow_repo = 'nanoarrow.h' in os.listdir(os.path.join(source_dir, 'src', 'nanoarrow')) has_cmake = os.system('cmake --version') == 0 - build_dir = os.path.join('.', '_cmake') - source_dir = os.path.abspath(os.path.join('..')) + build_dir = os.path.join(this_dir, '_cmake') if has_cmake and is_cmake_dir and is_in_nanoarrow_repo: try: - os.system(f'cmake -B "{build_dir}" -S "{source_dir}" -DNANOARROW_BUNDLE=ON') - os.system(f'cmake --install -B "{build_dir}" -DNANOARROW_BUNDLE=ON') + os.mkdir(build_dir) + os.chdir(build_dir) + os.system(f'cmake .. -DNANOARROW_BUNDLE=ON') + os.system(f'cmake --install . --prefix=../src/nanoarrow') finally: - os.unlink(build_dir) + os.chdir(this_wd) elif is_in_nanoarrow_repo: shutil.copyfile() From 1b38e9e8337147191c2c13ce4fa76ce4b5425d7e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 22:21:23 -0400 Subject: [PATCH 06/52] actually working bootstrap setup --- python/bootstrap.py | 185 ++++++++++++++++++++++++++++++++++++++++++++ python/setup.py | 146 ++-------------------------------- 2 files changed, 191 insertions(+), 140 deletions(-) create mode 100644 python/bootstrap.py diff --git a/python/bootstrap.py b/python/bootstrap.py new file mode 100644 index 000000000..95a87de59 --- /dev/null +++ b/python/bootstrap.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import re +import os +import shutil + +# Generate the nanoarrow_c.pxd file used by the Cython extension +class NanoarrowPxdGenerator: + + def __init__(self): + self._define_regexes() + + def generate_nanoarrow_pxd(self, file_in, file_out): + file_in_name = os.path.basename(file_in) + + # Read the nanoarrow.h header + content = None + with open(file_in, 'r') as input: + content = input.read() + + # Strip comments + content = self.re_comment.sub('', content) + + # Find types and function definitions + types = self._find_types(content) + func_defs = self._find_func_defs(content) + + # Make corresponding cython definitions + types_cython = [self._type_to_cython(t, ' ') for t in types] + func_defs_cython = [self._func_def_to_cython(d, ' ') for d in func_defs] + + # Unindent the header + header = self.re_newline_plus_indent.sub('\n', self._pxd_header()) + + # Write nanoarrow_c.pxd + with open(file_out, 'wb') as output: + output.write(header.encode('UTF-8')) + + output.write(f'\ncdef extern from "{file_in_name}":\n'.encode("UTF-8")) + + for type in types_cython: + output.write(type.encode('UTF-8')) + output.write(b'\n\n') + + for func_def in func_defs_cython: + output.write(func_def.encode('UTF-8')) + output.write(b'\n') + + output.write(b'\n') + + def _define_regexes(self): + self.re_comment = re.compile(r'\s*//[^\n]*') + self.re_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}') 
+ self.re_func_def = re.compile(r'\n(static inline )?(struct|enum )?(?P[A-Za-z]+) (?PArrow[A-Za-z]+)\((?P[^\)]*)\);') + self.re_tagged_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[A-Za-z]+)') + self.re_struct_delim = re.compile(r';\s*') + self.re_enum_delim = re.compile(r',\s*') + self.re_whitespace = re.compile(r'\s+') + self.re_newline_plus_indent = re.compile(r'\n +') + + def _strip_comments(self, content): + return self.re_comment.sub('', content) + + def _find_types(self, content): + return [m.groupdict() for m in self.re_type.finditer(content)] + + def _find_func_defs(self, content): + return [m.groupdict() for m in self.re_func_def.finditer(content)] + + def _type_to_cython(self, t, indent=''): + type = t['type'] + name = t['name'] + body = self.re_tagged_type.sub(r'\2', t['body'].strip()) + if type == 'enum': + items = [item for item in self.re_enum_delim.split(body) if item] + else: + items = [item for item in self.re_struct_delim.split(body) if item] + + cython_body = f'\n{indent} '.join([''] + items) + return f'{indent}cdef {type} {name}:{cython_body}' + + def _func_def_to_cython(self, d, indent=''): + return_type = d['return_type'] + name = d['name'] + args = re.sub(r'\s+', ' ', d['args'].strip()) + args = self.re_tagged_type.sub(r'\2', args) + return f'{indent}cdef {return_type} {name}({args})' + + def _pxd_header(self): + return """ + # Licensed to the Apache Software Foundation (ASF) under one + # or more contributor license agreements. See the NOTICE file + # distributed with this work for additional information + # regarding copyright ownership. The ASF licenses this file + # to you under the Apache License, Version 2.0 (the + # "License"); you may not use this file except in compliance + # with the License. 
You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, + # software distributed under the License is distributed on an + # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + # KIND, either express or implied. See the License for the + # specific language governing permissions and limitations + # under the License. + + # cython: language_level = 3 + + from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t,\ + uint32_t, int64_t, uint64_t + """ + +# Runs cmake -DNANOARROW_BUNDLE=ON if cmake exists or copies nanoarrow.c/h +# from ../dist if it does not. Running cmake is safer because it will sync +# any changes from nanoarrow C library sources in the checkout but is not +# strictly necessary for things like installing from GitHub. +def copy_or_generate_nanoarrow_c(): + this_wd = os.getcwd() + this_dir = os.path.abspath(os.path.dirname(__file__)) + source_dir = os.path.dirname(this_dir) + + maybe_nanoarrow_h = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.h') + maybe_nanoarrow_c = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.c') + for f in (maybe_nanoarrow_c, maybe_nanoarrow_h): + if os.path.exists(f): + os.unlink(f) + + is_cmake_dir = 'CMakeLists.txt' in os.listdir(source_dir) + is_in_nanoarrow_repo = 'nanoarrow.h' in os.listdir(os.path.join(source_dir, 'src', 'nanoarrow')) + has_cmake = os.system('cmake --version') == 0 + build_dir = os.path.join(this_dir, '_cmake') + + if has_cmake and is_cmake_dir and is_in_nanoarrow_repo: + try: + os.mkdir(build_dir) + os.chdir(build_dir) + os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON -DNANOARROW_NAMESPACE=PythonPkg') + os.system(f'cmake --install . 
--prefix=../src/nanoarrow') + finally: + if os.path.exists(build_dir): + shutil.rmtree(build_dir) + os.chdir(this_wd) + + elif is_in_nanoarrow_repo: + shutil.copyfile() + else: + raise ValueError('Attempt to build source distribution outside the nanoarrow repo') + + if not os.path.exists(os.path.join(this_dir, 'src/nanoarrow/nanoarrow.h')): + raise ValueError('Attempt to vendor nanoarrow.c/h failed') + + maybe_nanoarrow_hpp = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.hpp') + if os.path.exists(maybe_nanoarrow_hpp): + os.unlink(maybe_nanoarrow_hpp) + +# Runs the pxd generator with some information about the file name +def generate_nanoarrow_pxd(): + this_dir = os.path.abspath(os.path.dirname(__file__)) + maybe_nanoarrow_h = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.h') + maybe_nanoarrow_pxd = os.path.join(this_dir, 'src/nanoarrow/nanoarrow_c.pxd') + + NanoarrowPxdGenerator().generate_nanoarrow_pxd( + maybe_nanoarrow_h, + maybe_nanoarrow_pxd + ) + +if __name__ == '__main__': + copy_or_generate_nanoarrow_c() + generate_nanoarrow_pxd() diff --git a/python/setup.py b/python/setup.py index 68f538e9d..fdf9eaba7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -17,150 +17,16 @@ # specific language governing permissions and limitations # under the License. 
-import re import os -import shutil - +import sys +import subprocess from setuptools import Extension, setup - import numpy as np -class NanoarrowPxdGenerator: - - def __init__(self): - self._define_regexes() - - def generate_nanoarrow_pxd(self, file_in, file_out): - file_in_name = os.path.basename(file_in) - - # Read the nanoarrow.h header - content = None - with open(file_in, 'r') as input: - content = input.read() - - # Strip comments - content = self.re_comment.sub('', content) - - # Find types and function definitions - types = self._find_types(content) - func_defs = self._find_func_defs(content) - - # Make corresponding cython definitions - types_cython = [self._type_to_cython(t, ' ') for t in types] - func_defs_cython = [self._func_def_to_cython(d, ' ') for d in func_defs] - - # Unindent the header - header = self.re_newline_plus_indent.sub('\n', self._pxd_header()) - - # Write nanoarrow_c.pxd - with open(file_out, 'wb') as output: - output.write(header.encode('UTF-8')) - - output.write(f'\ncdef extern from "{file_in_name}":\n'.encode("UTF-8")) - - for type in types_cython: - output.write(type.encode('UTF-8')) - output.write(b'\n\n') - - for func_def in func_defs_cython: - output.write(func_def.encode('UTF-8')) - output.write(b'\n') - - output.write(b'\n') - - def _define_regexes(self): - self.re_comment = re.compile(r'\s*//[^\n]*') - self.re_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}') - self.re_func_def = re.compile(r'\n(static inline )?(struct|enum )?(?P[A-Za-z]+) (?PArrow[A-Za-z]+)\((?P[^\)]*)\);') - self.re_tagged_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[A-Za-z]+)') - self.re_struct_delim = re.compile(r';\s*') - self.re_enum_delim = re.compile(r',\s*') - self.re_whitespace = re.compile(r'\s+') - self.re_newline_plus_indent = re.compile(r'\n +') - - def _strip_comments(self, content): - return self.re_comment.sub('', content) - - def _find_types(self, content): - return [m.groupdict() for m in 
self.re_type.finditer(content)] - - def _find_func_defs(self, content): - return [m.groupdict() for m in self.re_func_def.finditer(content)] - - def _type_to_cython(self, t, indent=''): - type = t['type'] - name = t['name'] - body = self.re_tagged_type.sub(r'\2', t['body'].strip()) - if type == 'enum': - items = [item for item in self.re_enum_delim.split(body) if item] - else: - items = [item for item in self.re_struct_delim.split(body) if item] - - cython_body = f'\n{indent} '.join([''] + items) - return f'{indent}cdef {type} {name}:{cython_body}' - - def _func_def_to_cython(self, d, indent=''): - return_type = d['return_type'] - name = d['name'] - args = re.sub(r'\s+', ' ', d['args'].strip()) - args = self.re_tagged_type.sub(r'\2', args) - return f'{indent}cdef {return_type} {name}({args})' - - def _pxd_header(self): - return """ - # Licensed to the Apache Software Foundation (ASF) under one - # or more contributor license agreements. See the NOTICE file - # distributed with this work for additional information - # regarding copyright ownership. The ASF licenses this file - # to you under the Apache License, Version 2.0 (the - # "License"); you may not use this file except in compliance - # with the License. You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, - # software distributed under the License is distributed on an - # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - # KIND, either express or implied. See the License for the - # specific language governing permissions and limitations - # under the License. 
- - # cython: language_level = 3 - - from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t,\ - uint32_t, int64_t, uint64_t - """ - -def copy_or_generate_nanoarrow_c(): - this_wd = os.getcwd() - this_dir = os.path.abspath(os.path.dirname(__file__)) - source_dir = os.path.dirname(this_dir) - - is_cmake_dir = 'CMakeLists.txt' in os.listdir(source_dir) - is_in_nanoarrow_repo = 'nanoarrow.h' in os.listdir(os.path.join(source_dir, 'src', 'nanoarrow')) - has_cmake = os.system('cmake --version') == 0 - build_dir = os.path.join(this_dir, '_cmake') - - if has_cmake and is_cmake_dir and is_in_nanoarrow_repo: - try: - os.mkdir(build_dir) - os.chdir(build_dir) - os.system(f'cmake .. -DNANOARROW_BUNDLE=ON') - os.system(f'cmake --install . --prefix=../src/nanoarrow') - finally: - os.chdir(this_wd) - - elif is_in_nanoarrow_repo: - shutil.copyfile() - else: - raise ValueError('Attempt to build source distribution outside the nanoarrow repo') - -copy_or_generate_nanoarrow_c() - -NanoarrowPxdGenerator().generate_nanoarrow_pxd( - 'src/nanoarrow/nanoarrow.h', - 'src/nanoarrow/nanoarrow_c.pxd' -) +# Run bootstrap.py to run cmake generating a fresh bundle based on this +# checkout or copy from ../dist if the caller doesn't have cmake available +this_dir = os.path.dirname(__file__) +subprocess.run([sys.executable, os.path.join(this_dir, 'bootstrap.py')]) setup( ext_modules=[ From ca191e7db8a83427197615e7240d73b7719be274 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 22:37:56 -0400 Subject: [PATCH 07/52] fix indentation --- python/bootstrap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index 95a87de59..bf6771bb6 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -42,7 +42,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): # Make corresponding cython definitions types_cython = [self._type_to_cython(t, ' ') for t in types] - func_defs_cython = 
[self._func_def_to_cython(d, ' ') for d in func_defs] + func_defs_cython = [self._func_def_to_cython(d, ' ') for d in func_defs] # Unindent the header header = self.re_newline_plus_indent.sub('\n', self._pxd_header()) @@ -95,7 +95,7 @@ def _type_to_cython(self, t, indent=''): return f'{indent}cdef {type} {name}:{cython_body}' def _func_def_to_cython(self, d, indent=''): - return_type = d['return_type'] + return_type = d['return_type'].strip() name = d['name'] args = re.sub(r'\s+', ' ', d['args'].strip()) args = self.re_tagged_type.sub(r'\2', args) From 891afb124dab56f0a4901e974df1592a24967bd0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 22:46:35 -0400 Subject: [PATCH 08/52] add some typedefs --- python/bootstrap.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index bf6771bb6..1510e5dfa 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -53,6 +53,12 @@ def generate_nanoarrow_pxd(self, file_in, file_out): output.write(f'\ncdef extern from "{file_in_name}":\n'.encode("UTF-8")) + # A few things we add in manually + output.write(b'\n') + output.write(b' ctypedef int ArrowErrorCode\n') + output.write(b' cdef int NANOARROW_OK\n') + output.write(b'\n') + for type in types_cython: output.write(type.encode('UTF-8')) output.write(b'\n\n') @@ -122,8 +128,7 @@ def _pxd_header(self): # cython: language_level = 3 - from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t,\ - uint32_t, int64_t, uint64_t + from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t """ # Runs cmake -DNANOARROW_BUNDLE=ON if cmake exists or copies nanoarrow.c/h From 26581a541971e1f5eeef32fa8dd887692ac6e0ec Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 22:48:38 -0400 Subject: [PATCH 09/52] no void() --- python/bootstrap.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/bootstrap.py b/python/bootstrap.py 
index 1510e5dfa..409bd949e 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -105,6 +105,11 @@ def _func_def_to_cython(self, d, indent=''): name = d['name'] args = re.sub(r'\s+', ' ', d['args'].strip()) args = self.re_tagged_type.sub(r'\2', args) + + # Cython doesn't do (void) + if args == 'void': + args = '' + return f'{indent}cdef {return_type} {name}({args})' def _pxd_header(self): From 8531234d699784d9b6005aa24ec978267cebfbf8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 22:50:18 -0400 Subject: [PATCH 10/52] try without namespace --- python/bootstrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index 409bd949e..8da841f43 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -160,7 +160,7 @@ def copy_or_generate_nanoarrow_c(): try: os.mkdir(build_dir) os.chdir(build_dir) - os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON -DNANOARROW_NAMESPACE=PythonPkg') + os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON') os.system(f'cmake --install . 
--prefix=../src/nanoarrow') finally: if os.path.exists(build_dir): From 1bb020fc11dc75f10db28c4b9efc53dc98a3d3c3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 8 Mar 2023 22:56:10 -0400 Subject: [PATCH 11/52] better functionr regex --- python/bootstrap.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index 8da841f43..978314419 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -72,7 +72,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): def _define_regexes(self): self.re_comment = re.compile(r'\s*//[^\n]*') self.re_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}') - self.re_func_def = re.compile(r'\n(static inline )?(struct|enum )?(?P[A-Za-z]+) (?PArrow[A-Za-z]+)\((?P[^\)]*)\);') + self.re_func_def = re.compile(r'\n(static inline )?(?Pconst )?(struct|enum )?(?P[A-Za-z0-9_*]+) (?PArrow[A-Za-z]+)\((?P[^\)]*)\);') self.re_tagged_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[A-Za-z]+)') self.re_struct_delim = re.compile(r';\s*') self.re_enum_delim = re.compile(r',\s*') @@ -102,6 +102,8 @@ def _type_to_cython(self, t, indent=''): def _func_def_to_cython(self, d, indent=''): return_type = d['return_type'].strip() + if d['const']: + return_type = 'const ' + return_type name = d['name'] args = re.sub(r'\s+', ' ', d['args'].strip()) args = self.re_tagged_type.sub(r'\2', args) From 368ecbede7b13982ce3f6bf0d08a3cf9415f9e44 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 Mar 2023 10:05:21 -0400 Subject: [PATCH 12/52] move some bits from the other repo --- python/src/nanoarrow/__init__.py | 15 ------- python/src/nanoarrow/_lib.pyx | 77 ++++++++++++++++++++++++++++++-- python/tests/test_nanoarrow.py | 56 ++++++++++++++++++++--- src/nanoarrow/nanoarrow_types.h | 6 +++ 4 files changed, 130 insertions(+), 24 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 9a148a4fc..e429fb6e0 100644 --- 
a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -21,18 +21,3 @@ CSchemaHolder, CSchema, ) - -class Schema(CSchema): - - def __init__(self, parent=None, addr=None) -> None: - if parent is None: - parent = CSchemaHolder() - if addr is None: - addr = parent._addr() - super().__init__(parent, addr) - - @staticmethod - def from_pyarrow(obj): - schema = Schema() - obj._export_to_c(schema._addr()) - return schema diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index ba9cd21f9..f5a0a8923 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -102,6 +102,32 @@ cdef class CSchemaHolder: def _addr(self): return &self.c_schema +cdef class CArrayHolder: + cdef ArrowArray c_array + + def __init__(self): + self.c_array.release = NULL + + def __del__(self): + if self.c_array.release != NULL: + self.c_array.release(&self.c_array) + + def _addr(self): + return &self.c_array + +cdef class CArrayViewHolder: + cdef ArrowArrayView c_array_view + + def __init__(self): + ArrowArrayViewInitFromType(&self.c_array_view, NANOARROW_TYPE_UNINITIALIZED) + + def __del__(self): + ArrowArrayViewReset(&self.c_array_view) + + def _addr(self): + return &self.c_array_view + + cdef class CSchemaChildren: cdef CSchema _parent cdef int64_t _length @@ -118,7 +144,7 @@ cdef class CSchemaChildren: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - return type(self._parent)(self._parent, self._child_addr(k)) + return CSchema(self._parent, self._child_addr(k)) cdef _child_addr(self, int64_t i): cdef ArrowSchema** children = self._parent._ptr.children @@ -129,14 +155,26 @@ cdef class CSchema: cdef object _base cdef ArrowSchema* _ptr - def __init__(self, object base, uintptr_t addr) -> None: + @staticmethod + def Empty(): + base = CSchemaHolder() + return CSchema(base, base._addr()) + + def __init__(self, object base, uintptr_t addr): self._base = base, self._ptr = addr def _addr(self): 
return self._ptr - def __repr__(self) -> str: + def is_valid(self): + return self._ptr.release != NULL + + cdef void _assert_valid(self): + if self._ptr.release == NULL: + raise RuntimeError("schema is released") + + def __repr__(self): cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True) cdef char* out = PyMem_Malloc(n_chars + 1) if not out: @@ -150,13 +188,17 @@ cdef class CSchema: @property def format(self): + self._assert_valid() if self._ptr.format != NULL: return self._ptr.format.decode("UTF-8") @property def name(self): + self._assert_valid() if self._ptr.name != NULL: return self._ptr.name.decode("UTF-8") + else: + return None @property def flags(self): @@ -164,4 +206,33 @@ cdef class CSchema: @property def children(self): + self._assert_valid() return CSchemaChildren(self) + + def parse(self): + self._assert_valid() + + cdef ArrowError error + cdef ArrowSchemaView schema_view + + cdef int result = ArrowSchemaViewInit(&schema_view, self._ptr, &error) + if result != NANOARROW_OK: + raise ValueError(ArrowErrorMessage(&error)) + + out = { + 'name': self._ptr.name.decode('UTF-8') if self._ptr.name else None, + 'type': ArrowTypeString(schema_view.type).decode('UTF-8'), + 'storage_type': ArrowTypeString(schema_view.storage_type).decode('UTF-8') + } + + if schema_view.storage_type in (NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_FIXED_SIZE_BINARY): + out['fixed_size'] = schema_view.fixed_size + + if schema_view.storage_type in (NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256): + out['decimal_bitwidth'] = schema_view.decimal_bitwidth + out['decimal_precision'] = schema_view.decimal_precision + out['decimal_scale'] = schema_view.decimal_scale + + return out diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 305b6615c..1698b0aad 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -28,10 +28,54 @@ def test_as_numpy_array(): with pytest.raises(TypeError, match="Cannot convert a 
non-primitive array"): na.as_numpy_array(arr) -def test_schema(): +def test_schema_basic():# Blank invalid schema + schema = na.CSchema.Empty() + assert(schema.is_valid() is False) + assert(repr(schema) == "[invalid: schema is released]") + pa_schema = pa.schema([pa.field("some_name", pa.int32())]) - na_schema = na.Schema.from_pyarrow(pa_schema) - assert(na_schema.format == "+s") - assert(na_schema.flags == 0) - assert(len(na_schema.children), 1) - assert(na_schema.children[0].format == "i") + pa_schema._export_to_c(schema._addr()) + + assert(schema.format == "+s") + assert(schema.flags == 0) + assert(len(schema.children), 1) + assert(schema.children[0].format == "i") + assert(schema.children[0].name == "some_name") + assert(repr(schema.children[0]) == "int32") + + with pytest.raises(IndexError): + schema.children[1] + +def test_schema_parse(): + schema = na.CSchema.Empty() + with pytest.raises(ValueError): + schema.parse() + + pa.schema([pa.field("col1", pa.int32())])._export_to_c(schema._addr()) + + info = schema.parse() + assert(info['type'] == 'struct') + assert(info['storage_type'] == 'struct') + assert(info['name'] == '') + + # Check on the child + child = schema.children[0] + child_info = child.parse() + assert(child_info['type'] == 'int32') + assert(child_info['storage_type'] == 'int32') + assert(child_info['name'] == 'col1') + +def test_schema_info_params(): + schema = na.CSchema.Empty() + pa.binary(12)._export_to_c(schema._addr()) + assert(schema.parse()['fixed_size'] == 12) + + schema = na.CSchema.Empty() + pa.list_(pa.int32(), 12)._export_to_c(schema._addr()) + assert(schema.parse()['fixed_size'] == 12) + + schema = na.CSchema.Empty() + pa.decimal128(10, 3)._export_to_c(schema._addr()) + assert(schema.parse()['decimal_bitwidth'] == 128) + assert(schema.parse()['decimal_precision'] == 10) + assert(schema.parse()['decimal_scale'] == 3) diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h index 45ee3c636..bf85b19b2 100644 --- 
a/src/nanoarrow/nanoarrow_types.h +++ b/src/nanoarrow/nanoarrow_types.h @@ -298,6 +298,8 @@ enum ArrowType { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + static inline const char* ArrowTypeString(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_NA: @@ -416,6 +418,8 @@ enum ArrowValidationLevel { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: @@ -458,6 +462,8 @@ struct ArrowStringView { /// \brief Return a view of a const C string /// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + static inline struct ArrowStringView ArrowCharView(const char* value) { struct ArrowStringView out; From 5d0a50005604a818a3a785e25242dcdcccd0dd8a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 Mar 2023 10:23:55 -0400 Subject: [PATCH 13/52] with recursiveness --- python/src/nanoarrow/__init__.py | 2 + python/src/nanoarrow/_lib.pyx | 172 ++++++++++++++++++++++++++----- python/tests/test_nanoarrow.py | 67 ++++++++++++ 3 files changed, 217 insertions(+), 24 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index e429fb6e0..e9c74a974 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -20,4 +20,6 @@ version, CSchemaHolder, CSchema, + CArray, + CArrayView ) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index f5a0a8923..098ecb2f2 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -127,30 +127,6 @@ cdef class CArrayViewHolder: def _addr(self): return &self.c_array_view - -cdef class CSchemaChildren: - cdef CSchema _parent - cdef int64_t _length 
- - def __init__(self, CSchema parent): - self._parent = parent - self._length = parent._ptr.n_children - - def __len__(self): - return self._length - - def __getitem__(self, k): - k = int(k) - if k < 0 or k >= self._length: - raise IndexError(f"{k} out of range [0, {self._length})") - - return CSchema(self._parent, self._child_addr(k)) - - cdef _child_addr(self, int64_t i): - cdef ArrowSchema** children = self._parent._ptr.children - cdef ArrowSchema* child = children[i] - return child - cdef class CSchema: cdef object _base cdef ArrowSchema* _ptr @@ -236,3 +212,151 @@ cdef class CSchema: out['decimal_scale'] = schema_view.decimal_scale return out + +cdef class CArray: + cdef object _base + cdef ArrowArray* _ptr + cdef CSchema _schema + + @staticmethod + def Empty(CSchema schema): + base = CArrayHolder() + return CArray(base, base._addr(), schema) + + def __init__(self, object base, uintptr_t addr, CSchema schema): + self._base = base, + self._ptr = addr + self._schema = schema + + def _addr(self): + return self._ptr + + def is_valid(self): + return self._ptr.release != NULL + + cdef void _assert_valid(self): + if self._ptr.release == NULL: + raise RuntimeError("Array is released") + + @property + def schema(self): + return self._schema + + @property + def children(self): + return CArrayChildren(self) + + def validate(self): + cdef CArrayViewHolder holder = CArrayViewHolder() + + cdef ArrowError error + cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view, + self._schema._ptr, &error) + if result != NANOARROW_OK: + raise ValueError(ArrowErrorMessage(&error)) + + result = ArrowArrayViewSetArray(&holder.c_array_view, self._ptr, &error) + if result != NANOARROW_OK: + raise ValueError(ArrowErrorMessage(&error)) + + return CArrayView(holder, holder._addr(), self) + + +cdef class CArrayView: + cdef object _base + cdef ArrowArrayView* _ptr + cdef CArray _array + + def __init__(self, object base, uintptr_t addr, CArray array): + self._base = base, + 
self._ptr = addr + self._array = array + + @property + def children(self): + return CArrayViewChildren(self) + + @property + def array(self): + return self._array + + @property + def schema(self): + return self._array._schema + + def __len__(self): + return self._ptr.array.length + + def value_int(self, int64_t i): + if i < 0 or i >= self._ptr.array.length: + raise IndexError() + return ArrowArrayViewGetIntUnsafe(self._ptr, i) + +cdef class CSchemaChildren: + cdef CSchema _parent + cdef int64_t _length + + def __init__(self, CSchema parent): + self._parent = parent + self._length = parent._ptr.n_children + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + + return CSchema(self._parent, self._child_addr(k)) + + cdef _child_addr(self, int64_t i): + cdef ArrowSchema** children = self._parent._ptr.children + cdef ArrowSchema* child = children[i] + return child + +cdef class CArrayChildren: + cdef CArray _parent + cdef int64_t _length + + def __init__(self, CArray parent): + self._parent = parent + self._length = parent._ptr.n_children + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + + return CArray(self._parent, self._child_addr(k)) + + cdef _child_addr(self, int64_t i): + cdef ArrowArray** children = self._parent._ptr.children + cdef ArrowArray* child = children[i] + return child + +cdef class CArrayViewChildren: + cdef CArrayView _parent + cdef int64_t _length + + def __init__(self, CArrayView parent): + self._parent = parent + self._length = parent._ptr.n_children + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + + return CArrayView(self._parent, self._child_addr(k), 
self._parent._array) + + cdef _child_addr(self, int64_t i): + cdef ArrowArrayView** children = self._parent._ptr.children + cdef ArrowArrayView* child = children[i] + return child diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 1698b0aad..2275c3ab7 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import numpy as np import pyarrow as pa @@ -79,3 +96,53 @@ def test_schema_info_params(): assert(schema.parse()['decimal_bitwidth'] == 128) assert(schema.parse()['decimal_precision'] == 10) assert(schema.parse()['decimal_scale'] == 3) + +def test_array(): + schema = na.CSchema.Empty() + pa.int32()._export_to_c(schema._addr()) + + array = na.CArray.Empty(schema) + assert(array.is_valid() is False) + + pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr()) + assert(array.is_valid() is True) + + view = array.validate() + + assert(view.array is array) + assert(view.schema is schema) + assert(len(view) == 3) + + assert(view.value_int(0) == 1) + assert(view.value_int(1) == 2) + assert(view.value_int(2) == 3) + +def test_array_recursive(): + pa_array = pa.array([1, 2, 3], pa.int32()) + pa_batch = pa.record_batch([pa_array], names=["some_column"]) + + schema = na.CSchema.Empty() + pa_batch.schema._export_to_c(schema._addr()) + assert(len(schema.children) == 1) + with pytest.raises(IndexError): + schema.children[1] + + array = na.CArray.Empty(schema) + assert(array.is_valid() is False) + + pa_batch._export_to_c(array._addr()) + assert(array.is_valid() is True) + assert(len(array.children) == 1) + with pytest.raises(IndexError): + array.children[1] + + view = array.validate() + assert(len(view.children) == 1) + with pytest.raises(IndexError): + view.children[1] + + child = view.children[0] + assert(len(child) == 3) + assert(child.value_int(0) == 1) + assert(child.value_int(1) == 2) + assert(child.value_int(2) == 3) From a83e2e9bb83e78a991bd121dd758fbdd55d52e81 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 Mar 2023 11:21:17 -0400 Subject: [PATCH 14/52] passing buffer protocol test --- python/src/nanoarrow/_lib.pyx | 51 ++++++++++++++++++++++++++++++++++ python/tests/test_nanoarrow.py | 6 ++++ 2 files changed, 57 insertions(+) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 098ecb2f2..03474a086 100644 --- 
a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -21,6 +21,7 @@ from libc.stdint cimport uint8_t, uintptr_t, int64_t from cpython.mem cimport PyMem_Malloc, PyMem_Free +from cpython cimport Py_buffer from nanoarrow_c cimport * import numpy as np @@ -262,6 +263,52 @@ cdef class CArray: return CArrayView(holder, holder._addr(), self) +cdef class CBufferView: + cdef object _base + cdef ArrowBufferView* _ptr + cdef Py_ssize_t _shape + cdef Py_ssize_t _strides + + def __init__(self, object base, uintptr_t addr): + self._base = base + self._ptr = addr + self._shape = self._ptr.size_bytes + self._strides = 1 + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = self._ptr.data.data + buffer.format = NULL + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self._ptr.size_bytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 1 + buffer.shape = &self._shape + buffer.strides = &self._strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef class CArrayViewBuffers: + cdef CArrayView _array_view + cdef int64_t _length + + def __init__(self, CArrayView array_view): + self._array_view = array_view + self._length = array_view._array._ptr.n_buffers + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k]) + return CBufferView(self._array_view, buffer_view) + cdef class CArrayView: cdef object _base cdef ArrowArrayView* _ptr @@ -276,6 +323,10 @@ cdef class CArrayView: def children(self): return CArrayViewChildren(self) + @property + def buffers(self): + return CArrayViewBuffers(self) + @property def array(self): return self._array diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 2275c3ab7..437b78120 100644 --- a/python/tests/test_nanoarrow.py +++ 
b/python/tests/test_nanoarrow.py @@ -117,6 +117,12 @@ def test_array(): assert(view.value_int(1) == 2) assert(view.value_int(2) == 3) + data_buffer = memoryview(view.buffers[1]) + assert(len(data_buffer) == 12) + data_buffer_copy = bytes(data_buffer) + # (needs updating if testing on big endian) + assert(data_buffer_copy == b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00') + def test_array_recursive(): pa_array = pa.array([1, 2, 3], pa.int32()) pa_batch = pa.record_batch([pa_array], names=["some_column"]) From 18f1a20273959092a46c6bece1144a9dc20472a6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 Mar 2023 11:28:14 -0400 Subject: [PATCH 15/52] maybe fix errors --- python/src/nanoarrow/_lib.pyx | 4 ++-- python/tests/test_nanoarrow.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 03474a086..0dd022e95 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -147,7 +147,7 @@ cdef class CSchema: def is_valid(self): return self._ptr.release != NULL - cdef void _assert_valid(self): + def _assert_valid(self): if self._ptr.release == NULL: raise RuntimeError("schema is released") @@ -235,7 +235,7 @@ cdef class CArray: def is_valid(self): return self._ptr.release != NULL - cdef void _assert_valid(self): + def _assert_valid(self): if self._ptr.release == NULL: raise RuntimeError("Array is released") diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 437b78120..7dafa5387 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -45,7 +45,8 @@ def test_as_numpy_array(): with pytest.raises(TypeError, match="Cannot convert a non-primitive array"): na.as_numpy_array(arr) -def test_schema_basic():# Blank invalid schema +def test_schema_basic(): + # Blank invalid schema schema = na.CSchema.Empty() assert(schema.is_valid() is False) assert(repr(schema) == "[invalid: schema is released]") @@ 
-55,7 +56,7 @@ def test_schema_basic():# Blank invalid schema assert(schema.format == "+s") assert(schema.flags == 0) - assert(len(schema.children), 1) + assert(len(schema.children) == 1) assert(schema.children[0].format == "i") assert(schema.children[0].name == "some_name") assert(repr(schema.children[0]) == "int32") @@ -65,7 +66,7 @@ def test_schema_basic():# Blank invalid schema def test_schema_parse(): schema = na.CSchema.Empty() - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): schema.parse() pa.schema([pa.field("col1", pa.int32())])._export_to_c(schema._addr()) From 0cc14b8d8343a1127de292e47ff0e2aa1aa088bf Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 09:54:31 -0400 Subject: [PATCH 16/52] don't run bootstrap when installing from sdist --- python/MANIFEST.in | 18 ++++++++++++++++++ python/setup.py | 7 +++++-- 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 python/MANIFEST.in diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 000000000..61380d9a2 --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +exclude bootstrap.py diff --git a/python/setup.py b/python/setup.py index fdf9eaba7..0acdb0e5c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -24,9 +24,12 @@ import numpy as np # Run bootstrap.py to run cmake generating a fresh bundle based on this -# checkout or copy from ../dist if the caller doesn't have cmake available +# checkout or copy from ../dist if the caller doesn't have cmake available. +# Note that bootstrap.py won't exist if building from sdist. this_dir = os.path.dirname(__file__) -subprocess.run([sys.executable, os.path.join(this_dir, 'bootstrap.py')]) +bootstrap_py = os.path.join(this_dir, 'bootstrap.py') +if os.path.exists(bootstrap_py): + subprocess.run([sys.executable, bootstrap_py]) setup( ext_modules=[ From 9e72b962044c40339086888bda258576b24a5950 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 09:57:55 -0400 Subject: [PATCH 17/52] make sure we can install from sdist --- python/MANIFEST.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 61380d9a2..9fc293725 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -16,3 +16,6 @@ # under the License. 
exclude bootstrap.py +include src/nanoarrow/nanoarrow.c +include src/nanoarrow/nanoarrow.h +include src/nanoarrow/nanoarrow_c.pxd From 2cd5599f629c9a0188fdd89976bd51c7c7374c80 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 10:00:46 -0400 Subject: [PATCH 18/52] remove redundant cdefs --- python/bootstrap.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index 978314419..3ed7b10e2 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -67,8 +67,6 @@ def generate_nanoarrow_pxd(self, file_in, file_out): output.write(func_def.encode('UTF-8')) output.write(b'\n') - output.write(b'\n') - def _define_regexes(self): self.re_comment = re.compile(r'\s*//[^\n]*') self.re_type = re.compile(r'(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}') @@ -98,7 +96,7 @@ def _type_to_cython(self, t, indent=''): items = [item for item in self.re_struct_delim.split(body) if item] cython_body = f'\n{indent} '.join([''] + items) - return f'{indent}cdef {type} {name}:{cython_body}' + return f'{indent}{type} {name}:{cython_body}' def _func_def_to_cython(self, d, indent=''): return_type = d['return_type'].strip() @@ -112,7 +110,7 @@ def _func_def_to_cython(self, d, indent=''): if args == 'void': args = '' - return f'{indent}cdef {return_type} {name}({args})' + return f'{indent}{return_type} {name}({args})' def _pxd_header(self): return """ From 0144afe74977050e4580702f548c46a30efe081e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 10:23:00 -0400 Subject: [PATCH 19/52] fix asserts --- python/tests/test_nanoarrow.py | 86 ++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 7dafa5387..0714b8e8b 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -15,15 +15,15 @@ # specific language governing permissions and limitations # under the 
License. +import sys import numpy as np import pyarrow as pa +import pytest import nanoarrow as na -import pytest - def test_version(): - assert(na.version() == "0.2.0-SNAPSHOT") + assert na.version() == "0.2.0-SNAPSHOT" def test_as_numpy_array(): @@ -48,18 +48,18 @@ def test_as_numpy_array(): def test_schema_basic(): # Blank invalid schema schema = na.CSchema.Empty() - assert(schema.is_valid() is False) - assert(repr(schema) == "[invalid: schema is released]") + assert schema.is_valid() is False + assert repr(schema) == "[invalid: schema is released]" pa_schema = pa.schema([pa.field("some_name", pa.int32())]) pa_schema._export_to_c(schema._addr()) - assert(schema.format == "+s") - assert(schema.flags == 0) - assert(len(schema.children) == 1) - assert(schema.children[0].format == "i") - assert(schema.children[0].name == "some_name") - assert(repr(schema.children[0]) == "int32") + assert schema.format == "+s" + assert schema.flags == 0 + assert len(schema.children) == 1 + assert schema.children[0].format == "i" + assert schema.children[0].name == "some_name" + assert repr(schema.children[0]) == "int32" with pytest.raises(IndexError): schema.children[1] @@ -72,57 +72,61 @@ def test_schema_parse(): pa.schema([pa.field("col1", pa.int32())])._export_to_c(schema._addr()) info = schema.parse() - assert(info['type'] == 'struct') - assert(info['storage_type'] == 'struct') - assert(info['name'] == '') + assert info['type'] == 'struct' + assert info['storage_type'] == 'struct' + assert info['name'] == '' # Check on the child child = schema.children[0] child_info = child.parse() - assert(child_info['type'] == 'int32') - assert(child_info['storage_type'] == 'int32') - assert(child_info['name'] == 'col1') + assert child_info['type'] == 'int32' + assert child_info['storage_type'] == 'int32' + assert child_info['name'] == 'col1' def test_schema_info_params(): schema = na.CSchema.Empty() pa.binary(12)._export_to_c(schema._addr()) - assert(schema.parse()['fixed_size'] == 12) + 
assert schema.parse()['fixed_size'] == 12 schema = na.CSchema.Empty() pa.list_(pa.int32(), 12)._export_to_c(schema._addr()) - assert(schema.parse()['fixed_size'] == 12) + assert schema.parse()['fixed_size'] == 12 schema = na.CSchema.Empty() pa.decimal128(10, 3)._export_to_c(schema._addr()) - assert(schema.parse()['decimal_bitwidth'] == 128) - assert(schema.parse()['decimal_precision'] == 10) - assert(schema.parse()['decimal_scale'] == 3) + assert schema.parse()['decimal_bitwidth'] == 128 + assert schema.parse()['decimal_precision'] == 10 + assert schema.parse()['decimal_scale'] == 3 def test_array(): schema = na.CSchema.Empty() pa.int32()._export_to_c(schema._addr()) array = na.CArray.Empty(schema) - assert(array.is_valid() is False) + assert array.is_valid() is False pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr()) - assert(array.is_valid() is True) + assert array.is_valid() is True view = array.validate() - assert(view.array is array) - assert(view.schema is schema) - assert(len(view) == 3) + assert view.array is array + assert view.schema is schema + assert len(view) == 3 - assert(view.value_int(0) == 1) - assert(view.value_int(1) == 2) - assert(view.value_int(2) == 3) + assert view.value_int(0) == 1 + assert view.value_int(1) == 2 + assert view.value_int(2) == 3 data_buffer = memoryview(view.buffers[1]) - assert(len(data_buffer) == 12) + assert len(data_buffer) == 12 data_buffer_copy = bytes(data_buffer) # (needs updating if testing on big endian) - assert(data_buffer_copy == b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00') + + if sys.byteorder == 'little': + assert data_buffer_copy == b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00' + else: + assert data_buffer_copy == b'\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03' def test_array_recursive(): pa_array = pa.array([1, 2, 3], pa.int32()) @@ -130,26 +134,26 @@ def test_array_recursive(): schema = na.CSchema.Empty() pa_batch.schema._export_to_c(schema._addr()) - assert(len(schema.children) == 
1) + assert len(schema.children) == 1 with pytest.raises(IndexError): schema.children[1] array = na.CArray.Empty(schema) - assert(array.is_valid() is False) + assert array.is_valid() is False pa_batch._export_to_c(array._addr()) - assert(array.is_valid() is True) - assert(len(array.children) == 1) + assert array.is_valid() is True + assert len(array.children) == 1 with pytest.raises(IndexError): array.children[1] view = array.validate() - assert(len(view.children) == 1) + assert len(view.children) == 1 with pytest.raises(IndexError): view.children[1] child = view.children[0] - assert(len(child) == 3) - assert(child.value_int(0) == 1) - assert(child.value_int(1) == 2) - assert(child.value_int(2) == 3) + assert len(child) == 3 + assert child.value_int(0) == 1 + assert child.value_int(1) == 2 + assert child.value_int(2) == 3 From b768724ebe82446523d27d096fa2f087e25027a2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 10:26:35 -0400 Subject: [PATCH 20/52] maybe safer version check --- python/tests/test_nanoarrow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 0714b8e8b..d1434974c 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -16,6 +16,7 @@ # under the License. 
import sys +import re import numpy as np import pyarrow as pa import pytest @@ -23,7 +24,8 @@ import nanoarrow as na def test_version(): - assert na.version() == "0.2.0-SNAPSHOT" + re_version = re.compile(r'^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$') + assert re_version.match(na.version()) is not None def test_as_numpy_array(): From 846c8376b4b70e36a948e56db2c6727bfe399e36 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 10:38:07 -0400 Subject: [PATCH 21/52] drop C class prefix, drop numpy dependency, pythonize quotes --- python/pyproject.toml | 4 +- python/setup.py | 11 ++- python/src/nanoarrow/__init__.py | 8 +- python/src/nanoarrow/_lib.pyx | 141 +++++++++---------------------- python/tests/test_nanoarrow.py | 39 ++------- 5 files changed, 57 insertions(+), 146 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 1cc2c17ec..52b7d5bc5 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -23,7 +23,6 @@ description = "" authors = [{name = "Apache Arrow Developers", email = "dev@arrow.apache.org"}] license = {text = "Apache-2.0"} requires-python = ">=3.8" -dependencies = ["numpy"] [project.optional-dependencies] test = ["pyarrow", "pytest"] @@ -36,7 +35,6 @@ repository = "https://github.com/apache/arrow-nanoarrow" requires = [ "setuptools >= 61.0.0", "setuptools-scm", - "Cython", - "oldest-supported-numpy", + "Cython" ] build-backend = "setuptools.build_meta" diff --git a/python/setup.py b/python/setup.py index 0acdb0e5c..8b4b61c42 100644 --- a/python/setup.py +++ b/python/setup.py @@ -21,7 +21,6 @@ import sys import subprocess from setuptools import Extension, setup -import numpy as np # Run bootstrap.py to run cmake generating a fresh bundle based on this # checkout or copy from ../dist if the caller doesn't have cmake available. 
@@ -34,12 +33,12 @@ setup( ext_modules=[ Extension( - name="nanoarrow._lib", - include_dirs=[np.get_include(), "src/nanoarrow"], - language="c", + name='nanoarrow._lib', + include_dirs=['src/nanoarrow'], + language='c', sources=[ - "src/nanoarrow/_lib.pyx", - "src/nanoarrow/nanoarrow.c", + 'src/nanoarrow/_lib.pyx', + 'src/nanoarrow/nanoarrow.c', ], ) ] diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index e9c74a974..18847ccad 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -16,10 +16,8 @@ # under the License. from ._lib import ( # noqa: F401 - as_numpy_array, version, - CSchemaHolder, - CSchema, - CArray, - CArrayView + Schema, + Array, + ArrayView ) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 0dd022e95..a281fa522 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -19,78 +19,15 @@ """Low-level nanoarrow Python bindings.""" -from libc.stdint cimport uint8_t, uintptr_t, int64_t +from libc.stdint cimport uintptr_t, int64_t from cpython.mem cimport PyMem_Malloc, PyMem_Free from cpython cimport Py_buffer from nanoarrow_c cimport * -import numpy as np -cimport numpy as cnp - -cnp.import_array() - - -cdef dict _numpy_type_map = { - NANOARROW_TYPE_UINT8: cnp.NPY_UINT8, - NANOARROW_TYPE_INT8: cnp.NPY_INT8, - NANOARROW_TYPE_UINT16: cnp.NPY_UINT16, - NANOARROW_TYPE_INT16: cnp.NPY_INT16, - NANOARROW_TYPE_UINT32: cnp.NPY_UINT32, - NANOARROW_TYPE_INT32: cnp.NPY_INT32, - NANOARROW_TYPE_UINT64: cnp.NPY_UINT64, - NANOARROW_TYPE_INT64: cnp.NPY_INT64, - NANOARROW_TYPE_HALF_FLOAT: cnp.NPY_FLOAT16, - NANOARROW_TYPE_FLOAT: cnp.NPY_FLOAT32, - NANOARROW_TYPE_DOUBLE: cnp.NPY_FLOAT64, -} - - -def as_numpy_array(arr): - cdef ArrowSchema schema - cdef ArrowArray array - cdef ArrowArrayView array_view - cdef ArrowError error - - arr._export_to_c( &array, &schema) - ArrowArrayViewInitFromSchema(&array_view, &schema, &error) - - # primitive arrays have DATA 
as the second buffer - if array_view.layout.buffer_type[1] != NANOARROW_BUFFER_TYPE_DATA: - raise TypeError("Cannot convert a non-primitive array") - - # disallow nulls for this method - if array.null_count > 0: - raise ValueError("Cannot convert array with nulls") - elif array.null_count < 0: - # not yet computed - if array_view.layout.buffer_type[0] == NANOARROW_BUFFER_TYPE_VALIDITY: - if array.buffers[0] != NULL: - null_count = ArrowBitCountSet( - array.buffers[0], array.offset, array.length - ) - if null_count > 0: - raise ValueError("Cannot convert array with nulls") - - cdef int type_num - if array_view.storage_type in _numpy_type_map: - type_num = _numpy_type_map[array_view.storage_type] - else: - raise NotImplementedError(array_view.storage_type) - - cdef cnp.npy_intp dims[1] - dims[0] = array.length - cdef cnp.ndarray result = cnp.PyArray_New( - np.ndarray, 1, dims, type_num, NULL, array.buffers[1], -1, 0, NULL - ) - # TODO set base - - return result - - def version(): return ArrowNanoarrowVersion().decode("UTF-8") -cdef class CSchemaHolder: +cdef class SchemaHolder: cdef ArrowSchema c_schema def __init__(self): @@ -103,7 +40,7 @@ cdef class CSchemaHolder: def _addr(self): return &self.c_schema -cdef class CArrayHolder: +cdef class ArrayHolder: cdef ArrowArray c_array def __init__(self): @@ -116,7 +53,7 @@ cdef class CArrayHolder: def _addr(self): return &self.c_array -cdef class CArrayViewHolder: +cdef class ArrayViewHolder: cdef ArrowArrayView c_array_view def __init__(self): @@ -128,14 +65,14 @@ cdef class CArrayViewHolder: def _addr(self): return &self.c_array_view -cdef class CSchema: +cdef class Schema: cdef object _base cdef ArrowSchema* _ptr @staticmethod def Empty(): - base = CSchemaHolder() - return CSchema(base, base._addr()) + base = SchemaHolder() + return Schema(base, base._addr()) def __init__(self, object base, uintptr_t addr): self._base = base, @@ -184,7 +121,7 @@ cdef class CSchema: @property def children(self): self._assert_valid() - 
return CSchemaChildren(self) + return SchemaChildren(self) def parse(self): self._assert_valid() @@ -214,17 +151,17 @@ cdef class CSchema: return out -cdef class CArray: +cdef class Array: cdef object _base cdef ArrowArray* _ptr - cdef CSchema _schema + cdef Schema _schema @staticmethod - def Empty(CSchema schema): - base = CArrayHolder() - return CArray(base, base._addr(), schema) + def Empty(Schema schema): + base = ArrayHolder() + return Array(base, base._addr(), schema) - def __init__(self, object base, uintptr_t addr, CSchema schema): + def __init__(self, object base, uintptr_t addr, Schema schema): self._base = base, self._ptr = addr self._schema = schema @@ -245,10 +182,10 @@ cdef class CArray: @property def children(self): - return CArrayChildren(self) + return ArrayChildren(self) def validate(self): - cdef CArrayViewHolder holder = CArrayViewHolder() + cdef ArrayViewHolder holder = ArrayViewHolder() cdef ArrowError error cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view, @@ -260,10 +197,10 @@ cdef class CArray: if result != NANOARROW_OK: raise ValueError(ArrowErrorMessage(&error)) - return CArrayView(holder, holder._addr(), self) + return ArrayView(holder, holder._addr(), self) -cdef class CBufferView: +cdef class BufferView: cdef object _base cdef ArrowBufferView* _ptr cdef Py_ssize_t _shape @@ -291,11 +228,11 @@ cdef class CBufferView: def __releasebuffer__(self, Py_buffer *buffer): pass -cdef class CArrayViewBuffers: - cdef CArrayView _array_view +cdef class ArrayViewBuffers: + cdef ArrayView _array_view cdef int64_t _length - def __init__(self, CArrayView array_view): + def __init__(self, ArrayView array_view): self._array_view = array_view self._length = array_view._array._ptr.n_buffers @@ -307,25 +244,25 @@ cdef class CArrayViewBuffers: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k]) - return 
CBufferView(self._array_view, buffer_view) + return BufferView(self._array_view, buffer_view) -cdef class CArrayView: +cdef class ArrayView: cdef object _base cdef ArrowArrayView* _ptr - cdef CArray _array + cdef Array _array - def __init__(self, object base, uintptr_t addr, CArray array): + def __init__(self, object base, uintptr_t addr, Array array): self._base = base, self._ptr = addr self._array = array @property def children(self): - return CArrayViewChildren(self) + return ArrayViewChildren(self) @property def buffers(self): - return CArrayViewBuffers(self) + return ArrayViewBuffers(self) @property def array(self): @@ -343,11 +280,11 @@ cdef class CArrayView: raise IndexError() return ArrowArrayViewGetIntUnsafe(self._ptr, i) -cdef class CSchemaChildren: - cdef CSchema _parent +cdef class SchemaChildren: + cdef Schema _parent cdef int64_t _length - def __init__(self, CSchema parent): + def __init__(self, Schema parent): self._parent = parent self._length = parent._ptr.n_children @@ -359,18 +296,18 @@ cdef class CSchemaChildren: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - return CSchema(self._parent, self._child_addr(k)) + return Schema(self._parent, self._child_addr(k)) cdef _child_addr(self, int64_t i): cdef ArrowSchema** children = self._parent._ptr.children cdef ArrowSchema* child = children[i] return child -cdef class CArrayChildren: - cdef CArray _parent +cdef class ArrayChildren: + cdef Array _parent cdef int64_t _length - def __init__(self, CArray parent): + def __init__(self, Array parent): self._parent = parent self._length = parent._ptr.n_children @@ -382,18 +319,18 @@ cdef class CArrayChildren: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - return CArray(self._parent, self._child_addr(k)) + return Array(self._parent, self._child_addr(k)) cdef _child_addr(self, int64_t i): cdef ArrowArray** children = self._parent._ptr.children cdef ArrowArray* child = 
children[i] return child -cdef class CArrayViewChildren: - cdef CArrayView _parent +cdef class ArrayViewChildren: + cdef ArrayView _parent cdef int64_t _length - def __init__(self, CArrayView parent): + def __init__(self, ArrayView parent): self._parent = parent self._length = parent._ptr.n_children @@ -405,7 +342,7 @@ cdef class CArrayViewChildren: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - return CArrayView(self._parent, self._child_addr(k), self._parent._array) + return ArrayView(self._parent, self._child_addr(k), self._parent._array) cdef _child_addr(self, int64_t i): cdef ArrowArrayView** children = self._parent._ptr.children diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index d1434974c..37cb273be 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -17,7 +17,6 @@ import sys import re -import numpy as np import pyarrow as pa import pytest @@ -27,29 +26,9 @@ def test_version(): re_version = re.compile(r'^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$') assert re_version.match(na.version()) is not None -def test_as_numpy_array(): - - arr = pa.array([1, 2, 3]) - result = na.as_numpy_array(arr) - expected = arr.to_numpy() - np.testing.assert_array_equal(result, expected) - - arr = pa.array([1, 2, 3], pa.uint8()) - result = na.as_numpy_array(arr) - expected = arr.to_numpy() - np.testing.assert_array_equal(result, expected) - - arr = pa.array([1, 2, None]) - with pytest.raises(ValueError, match="Cannot convert array with nulls"): - na.as_numpy_array(arr) - - arr = pa.array([[1], [2, 3]]) - with pytest.raises(TypeError, match="Cannot convert a non-primitive array"): - na.as_numpy_array(arr) - def test_schema_basic(): # Blank invalid schema - schema = na.CSchema.Empty() + schema = na.Schema.Empty() assert schema.is_valid() is False assert repr(schema) == "[invalid: schema is released]" @@ -67,7 +46,7 @@ def test_schema_basic(): schema.children[1] def test_schema_parse(): 
- schema = na.CSchema.Empty() + schema = na.Schema.Empty() with pytest.raises(RuntimeError): schema.parse() @@ -86,25 +65,25 @@ def test_schema_parse(): assert child_info['name'] == 'col1' def test_schema_info_params(): - schema = na.CSchema.Empty() + schema = na.Schema.Empty() pa.binary(12)._export_to_c(schema._addr()) assert schema.parse()['fixed_size'] == 12 - schema = na.CSchema.Empty() + schema = na.Schema.Empty() pa.list_(pa.int32(), 12)._export_to_c(schema._addr()) assert schema.parse()['fixed_size'] == 12 - schema = na.CSchema.Empty() + schema = na.Schema.Empty() pa.decimal128(10, 3)._export_to_c(schema._addr()) assert schema.parse()['decimal_bitwidth'] == 128 assert schema.parse()['decimal_precision'] == 10 assert schema.parse()['decimal_scale'] == 3 def test_array(): - schema = na.CSchema.Empty() + schema = na.Schema.Empty() pa.int32()._export_to_c(schema._addr()) - array = na.CArray.Empty(schema) + array = na.Array.Empty(schema) assert array.is_valid() is False pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr()) @@ -134,13 +113,13 @@ def test_array_recursive(): pa_array = pa.array([1, 2, 3], pa.int32()) pa_batch = pa.record_batch([pa_array], names=["some_column"]) - schema = na.CSchema.Empty() + schema = na.Schema.Empty() pa_batch.schema._export_to_c(schema._addr()) assert len(schema.children) == 1 with pytest.raises(IndexError): schema.children[1] - array = na.CArray.Empty(schema) + array = na.Array.Empty(schema) assert array.is_valid() is False pa_batch._export_to_c(array._addr()) From ac304af59a6233604fb6404af31022d5ae6a6185 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 12:48:31 -0400 Subject: [PATCH 22/52] more complete schema field wrapping --- python/src/nanoarrow/_lib.pyx | 243 ++++++++++++++++++++++++--------- python/tests/test_nanoarrow.py | 84 +++++++++--- 2 files changed, 241 insertions(+), 86 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index a281fa522..2db4ce85e 
100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -21,6 +21,7 @@ from libc.stdint cimport uintptr_t, int64_t from cpython.mem cimport PyMem_Malloc, PyMem_Free +from cpython.bytes cimport PyBytes_FromStringAndSize from cpython cimport Py_buffer from nanoarrow_c cimport * @@ -118,38 +119,121 @@ cdef class Schema: def flags(self): return self._ptr.flags + @property + def metadata(self): + self._assert_valid() + if self._ptr.metadata != NULL: + return SchemaMetadata(self, self._ptr.metadata) + else: + return None + @property def children(self): self._assert_valid() return SchemaChildren(self) - def parse(self): + def view(self): self._assert_valid() - + schema_view = SchemaView() cdef ArrowError error - cdef ArrowSchemaView schema_view - - cdef int result = ArrowSchemaViewInit(&schema_view, self._ptr, &error) + cdef int result = ArrowSchemaViewInit(&schema_view._schema_view, self._ptr, &error) if result != NANOARROW_OK: raise ValueError(ArrowErrorMessage(&error)) + return schema_view - out = { - 'name': self._ptr.name.decode('UTF-8') if self._ptr.name else None, - 'type': ArrowTypeString(schema_view.type).decode('UTF-8'), - 'storage_type': ArrowTypeString(schema_view.storage_type).decode('UTF-8') - } +cdef class SchemaView: + cdef ArrowSchemaView _schema_view - if schema_view.storage_type in (NANOARROW_TYPE_FIXED_SIZE_LIST, - NANOARROW_TYPE_FIXED_SIZE_BINARY): - out['fixed_size'] = schema_view.fixed_size + _fixed_size_types = ( + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_FIXED_SIZE_BINARY + ) - if schema_view.storage_type in (NANOARROW_TYPE_DECIMAL128, - NANOARROW_TYPE_DECIMAL256): - out['decimal_bitwidth'] = schema_view.decimal_bitwidth - out['decimal_precision'] = schema_view.decimal_precision - out['decimal_scale'] = schema_view.decimal_scale + _decimal_types = ( + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256 + ) - return out + _time_unit_types = ( + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + 
NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_TIMESTAMP + ) + + _union_types = ( + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_SPARSE_UNION + ) + + def __init__(self): + self._schema_view.type = NANOARROW_TYPE_UNINITIALIZED + self._schema_view.storage_type = NANOARROW_TYPE_UNINITIALIZED + + @property + def type(self): + cdef const char* type_str = ArrowTypeString(self._schema_view.type) + if type_str != NULL: + return type_str.decode('UTF-8') + + @property + def storage_type(self): + cdef const char* type_str = ArrowTypeString(self._schema_view.storage_type) + if type_str != NULL: + return type_str.decode('UTF-8') + + @property + def fixed_size(self): + if self._schema_view.type in SchemaView._fixed_size_types: + return self._schema_view.fixed_size + + @property + def decimal_bitwidth(self): + if self._schema_view.type in SchemaView._decimal_types: + return self._schema_view.decimal_bitwidth + + @property + def decimal_precision(self): + if self._schema_view.type in SchemaView._decimal_types: + return self._schema_view.decimal_precision + + @property + def decimal_scale(self): + if self._schema_view.type in SchemaView._decimal_types: + return self._schema_view.decimal_scale + + @property + def time_unit(self): + if self._schema_view.type in SchemaView._time_unit_types: + return ArrowTimeUnitString(self._schema_view.time_unit).decode('UTF-8') + + @property + def timezone(self): + if self._schema_view.type == NANOARROW_TYPE_TIMESTAMP: + return self._schema_view.timezone.decode('UTF_8') + + @property + def union_type_ids(self): + if self._schema_view.type in SchemaView._union_types: + type_ids_str = self._schema_view.union_type_ids.decode('UTF-8').split(',') + return (int(type_id) for type_id in type_ids_str) + + @property + def extension_name(self): + if self._schema_view.extension_name.data != NULL: + name_bytes = PyBytes_FromStringAndSize( + self._schema_view.extension_name.data, + self._schema_view.extension_name.size_bytes + ) + return name_bytes.decode('UTF-8') + + 
@property + def extension_metadata(self): + if self._schema_view.extension_name.data != NULL: + return PyBytes_FromStringAndSize( + self._schema_view.extension_metadata.data, + self._schema_view.extension_metadata.size_bytes + ) cdef class Array: cdef object _base @@ -199,53 +283,6 @@ cdef class Array: return ArrayView(holder, holder._addr(), self) - -cdef class BufferView: - cdef object _base - cdef ArrowBufferView* _ptr - cdef Py_ssize_t _shape - cdef Py_ssize_t _strides - - def __init__(self, object base, uintptr_t addr): - self._base = base - self._ptr = addr - self._shape = self._ptr.size_bytes - self._strides = 1 - - def __getbuffer__(self, Py_buffer *buffer, int flags): - buffer.buf = self._ptr.data.data - buffer.format = NULL - buffer.internal = NULL - buffer.itemsize = 1 - buffer.len = self._ptr.size_bytes - buffer.ndim = 1 - buffer.obj = self - buffer.readonly = 1 - buffer.shape = &self._shape - buffer.strides = &self._strides - buffer.suboffsets = NULL - - def __releasebuffer__(self, Py_buffer *buffer): - pass - -cdef class ArrayViewBuffers: - cdef ArrayView _array_view - cdef int64_t _length - - def __init__(self, ArrayView array_view): - self._array_view = array_view - self._length = array_view._array._ptr.n_buffers - - def __len__(self): - return self._length - - def __getitem__(self, k): - k = int(k) - if k < 0 or k >= self._length: - raise IndexError(f"{k} out of range [0, {self._length})") - cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k]) - return BufferView(self._array_view, buffer_view) - cdef class ArrayView: cdef object _base cdef ArrowArrayView* _ptr @@ -303,6 +340,34 @@ cdef class SchemaChildren: cdef ArrowSchema* child = children[i] return child +cdef class SchemaMetadata: + cdef object _parent + cdef const char* _metadata + cdef ArrowMetadataReader _reader + + def __init__(self, object parent, uintptr_t ptr): + self._parent = parent + self._metadata = ptr + + def _init_reader(self): + cdef int result = 
ArrowMetadataReaderInit(&self._reader, self._metadata) + if result != NANOARROW_OK: + raise ValueError('ArrowMetadataReaderInit() failed') + + def __len__(self): + self._init_reader() + return self._reader.remaining_keys + + def __iter__(self): + cdef ArrowStringView key + cdef ArrowStringView value + self._init_reader() + while self._reader.remaining_keys > 0: + ArrowMetadataReaderRead(&self._reader, &key, &value) + key_obj = PyBytes_FromStringAndSize(key.data, key.size_bytes).decode('UTF-8') + value_obj = PyBytes_FromStringAndSize(value.data, value.size_bytes) + yield key_obj, value_obj + cdef class ArrayChildren: cdef Array _parent cdef int64_t _length @@ -348,3 +413,49 @@ cdef class ArrayViewChildren: cdef ArrowArrayView** children = self._parent._ptr.children cdef ArrowArrayView* child = children[i] return child + +cdef class BufferView: + cdef object _base + cdef ArrowBufferView* _ptr + cdef Py_ssize_t _shape + cdef Py_ssize_t _strides + + def __init__(self, object base, uintptr_t addr): + self._base = base + self._ptr = addr + self._shape = self._ptr.size_bytes + self._strides = 1 + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = self._ptr.data.data + buffer.format = NULL + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self._ptr.size_bytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 1 + buffer.shape = &self._shape + buffer.strides = &self._strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef class ArrayViewBuffers: + cdef ArrayView _array_view + cdef int64_t _length + + def __init__(self, ArrayView array_view): + self._array_view = array_view + self._length = array_view._array._ptr.n_buffers + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k]) + return 
BufferView(self._array_view, buffer_view) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 37cb273be..0c0077ff8 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -37,6 +37,7 @@ def test_schema_basic(): assert schema.format == "+s" assert schema.flags == 0 + assert schema.metadata is None assert len(schema.children) == 1 assert schema.children[0].format == "i" assert schema.children[0].name == "some_name" @@ -45,39 +46,82 @@ def test_schema_basic(): with pytest.raises(IndexError): schema.children[1] -def test_schema_parse(): +def test_schema_metadata(): schema = na.Schema.Empty() - with pytest.raises(RuntimeError): - schema.parse() + meta = {'key1': 'value1', 'key2': 'value2'} + pa.field('', pa.int32(), metadata=meta)._export_to_c(schema._addr()) - pa.schema([pa.field("col1", pa.int32())])._export_to_c(schema._addr()) + assert len(schema.metadata) == 2 - info = schema.parse() - assert info['type'] == 'struct' - assert info['storage_type'] == 'struct' - assert info['name'] == '' + meta2 = {k: v for k, v in schema.metadata} + assert list(meta2.keys()) == ['key1', 'key2'] + assert list(meta2.values()) == [b'value1', b'value2'] - # Check on the child - child = schema.children[0] - child_info = child.parse() - assert child_info['type'] == 'int32' - assert child_info['storage_type'] == 'int32' - assert child_info['name'] == 'col1' +def test_schema_view(): + schema = na.Schema.Empty() + with pytest.raises(RuntimeError): + schema.view() -def test_schema_info_params(): + pa.int32()._export_to_c(schema._addr()) + view = schema.view() + assert view.type == 'int32' + assert view.storage_type == 'int32' + + assert view.fixed_size is None + assert view.decimal_bitwidth is None + assert view.decimal_scale is None + assert view.time_unit is None + assert view.timezone is None + assert view.union_type_ids is None + assert view.extension_name is None + assert view.extension_metadata is None + +def 
test_schema_view_extra_params(): schema = na.Schema.Empty() pa.binary(12)._export_to_c(schema._addr()) - assert schema.parse()['fixed_size'] == 12 + view = schema.view() + assert view.fixed_size == 12 schema = na.Schema.Empty() pa.list_(pa.int32(), 12)._export_to_c(schema._addr()) - assert schema.parse()['fixed_size'] == 12 + assert view.fixed_size == 12 schema = na.Schema.Empty() pa.decimal128(10, 3)._export_to_c(schema._addr()) - assert schema.parse()['decimal_bitwidth'] == 128 - assert schema.parse()['decimal_precision'] == 10 - assert schema.parse()['decimal_scale'] == 3 + view = schema.view() + assert view.decimal_bitwidth == 128 + assert view.decimal_precision == 10 + assert view.decimal_scale == 3 + + schema = na.Schema.Empty() + pa.decimal256(10, 3)._export_to_c(schema._addr()) + view = schema.view() + assert view.decimal_bitwidth == 256 + assert view.decimal_precision == 10 + assert view.decimal_scale == 3 + + schema = na.Schema.Empty() + pa.duration('us')._export_to_c(schema._addr()) + view = schema.view() + assert view.time_unit == 'us' + + schema = na.Schema.Empty() + pa.timestamp('us', tz='America/Halifax')._export_to_c(schema._addr()) + view = schema.view() + assert view.type == 'timestamp' + assert view.storage_type == 'int64' + assert view.time_unit == 'us' + assert view.timezone == 'America/Halifax' + + schema = na.Schema.Empty() + meta = { + 'ARROW:extension:name': 'some_name', + 'ARROW:extension:metadata': 'some_metadata' + } + pa.field('', pa.int32(), metadata=meta)._export_to_c(schema._addr()) + view = schema.view() + assert view.extension_name == 'some_name' + assert view.extension_metadata == b'some_metadata' def test_array(): schema = na.Schema.Empty() From 05d4a812dc72309f72a4b07003a2bcaf4a9f4063 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 13:42:45 -0400 Subject: [PATCH 23/52] test a few more array view things --- python/src/nanoarrow/_lib.pyx | 50 +++++++++++++++++------ python/tests/test_nanoarrow.py | 74 
++++++++++++++++++++-------------- 2 files changed, 80 insertions(+), 44 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 2db4ce85e..1aa414b01 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -132,6 +132,14 @@ cdef class Schema: self._assert_valid() return SchemaChildren(self) + @property + def dictionary(self): + self._assert_valid() + if self._ptr.dictionary != NULL: + return Schema(self, self._ptr.dictionary) + else: + return None + def view(self): self._assert_valid() schema_view = SchemaView() @@ -264,11 +272,37 @@ cdef class Array: def schema(self): return self._schema + @property + def length(self): + self._assert_valid() + return self._ptr.length + + @property + def offset(self): + self._assert_valid() + return self._ptr.offset + + @property + def null_count(self): + return self._ptr.null_count + + @property + def buffers(self): + return tuple(self._ptr.buffers[i] for i in range(self._ptr.n_buffers)) + @property def children(self): return ArrayChildren(self) - def validate(self): + @property + def dictionary(self): + self._assert_valid() + if self._ptr.dictionary != NULL: + return Array(self, self._ptr.dictionary, self._schema.dictionary) + else: + return None + + def view(self): cdef ArrayViewHolder holder = ArrayViewHolder() cdef ArrowError error @@ -309,14 +343,6 @@ cdef class ArrayView: def schema(self): return self._array._schema - def __len__(self): - return self._ptr.array.length - - def value_int(self, int64_t i): - if i < 0 or i >= self._ptr.array.length: - raise IndexError() - return ArrowArrayViewGetIntUnsafe(self._ptr, i) - cdef class SchemaChildren: cdef Schema _parent cdef int64_t _length @@ -383,8 +409,7 @@ cdef class ArrayChildren: k = int(k) if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - - return Array(self._parent, self._child_addr(k)) + return Array(self._parent, self._child_addr(k), self._parent.schema.children[k]) 
cdef _child_addr(self, int64_t i): cdef ArrowArray** children = self._parent._ptr.children @@ -406,8 +431,7 @@ cdef class ArrayViewChildren: k = int(k) if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - - return ArrayView(self._parent, self._child_addr(k), self._parent._array) + return ArrayView(self._parent, self._child_addr(k), self._parent._array.children[k]) cdef _child_addr(self, int64_t i): cdef ArrowArrayView** children = self._parent._ptr.children diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 0c0077ff8..af5b24588 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -27,7 +27,6 @@ def test_version(): assert re_version.match(na.version()) is not None def test_schema_basic(): - # Blank invalid schema schema = na.Schema.Empty() assert schema.is_valid() is False assert repr(schema) == "[invalid: schema is released]" @@ -42,10 +41,17 @@ def test_schema_basic(): assert schema.children[0].format == "i" assert schema.children[0].name == "some_name" assert repr(schema.children[0]) == "int32" + assert schema.dictionary is None with pytest.raises(IndexError): schema.children[1] +def test_schema_dictionary(): + schema = na.Schema.Empty() + pa.dictionary(pa.int32(), pa.utf8())._export_to_c(schema._addr()) + assert schema.format == 'i' + assert schema.dictionary.format == 'u' + def test_schema_metadata(): schema = na.Schema.Empty() meta = {'key1': 'value1', 'key2': 'value2'} @@ -132,53 +138,59 @@ def test_array(): pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr()) assert array.is_valid() is True + assert array.length == 3 + assert array.offset == 0 + assert array.null_count == 0 + assert len(array.buffers) == 2 + assert array.buffers[0] == 0 + assert len(array.children) == 0 + assert array.dictionary is None - view = array.validate() + with pytest.raises(IndexError): + array.children[1] - assert view.array is array - assert view.schema is schema - assert 
len(view) == 3 +def test_array_view(): + array = na.Array.Empty(na.Schema.Empty()) + pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr(), array.schema._addr()) + view = array.view() - assert view.value_int(0) == 1 - assert view.value_int(1) == 2 - assert view.value_int(2) == 3 + assert view.array is array + assert view.schema is array.schema data_buffer = memoryview(view.buffers[1]) assert len(data_buffer) == 12 data_buffer_copy = bytes(data_buffer) - # (needs updating if testing on big endian) if sys.byteorder == 'little': assert data_buffer_copy == b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00' else: assert data_buffer_copy == b'\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03' -def test_array_recursive(): - pa_array = pa.array([1, 2, 3], pa.int32()) - pa_batch = pa.record_batch([pa_array], names=["some_column"]) - - schema = na.Schema.Empty() - pa_batch.schema._export_to_c(schema._addr()) - assert len(schema.children) == 1 with pytest.raises(IndexError): - schema.children[1] + view.children[1] - array = na.Array.Empty(schema) - assert array.is_valid() is False +def test_array_view_recursive(): + pa_array_child = pa.array([1, 2, 3], pa.int32()) + pa_array = pa.record_batch([pa_array_child], names=["some_column"]) - pa_batch._export_to_c(array._addr()) - assert array.is_valid() is True + array = na.Array.Empty(na.Schema.Empty()) + pa_array._export_to_c(array._addr(), array.schema._addr()) + + assert array.schema.format == '+s' + assert array.length == 3 assert len(array.children) == 1 - with pytest.raises(IndexError): - array.children[1] - view = array.validate() + assert array.children[0].schema.format == 'i' + assert array.children[0].length == 3 + assert array.children[0].schema._addr() == array.schema.children[0]._addr() + + view = array.view() + assert len(view.buffers) == 1 assert len(view.children) == 1 - with pytest.raises(IndexError): - view.children[1] + assert view.array._addr() == array._addr() + assert view.schema._addr() == 
array.schema._addr() - child = view.children[0] - assert len(child) == 3 - assert child.value_int(0) == 1 - assert child.value_int(1) == 2 - assert child.value_int(2) == 3 + assert len(view.children[0].buffers) == 2 + assert view.children[0].array._addr() == array.children[0]._addr() + assert view.children[0].schema._addr() == array.schema.children[0]._addr() + assert view.children[0].schema._addr() == array.children[0].schema._addr() From bbf18df4425edb9df511699984e858d03bcdadcb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 14:06:48 -0400 Subject: [PATCH 24/52] maybe install on Windows, update install instructions --- python/README.md | 28 ++++++++++++++++++++-------- python/bootstrap.py | 6 +++++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/python/README.md b/python/README.md index 701896bb5..04d05898b 100644 --- a/python/README.md +++ b/python/README.md @@ -19,26 +19,38 @@ # nanoarrow for Python -Python bindings for nanoarrow. +Python bindings for nanoarrow. These are in a preliminary state: see open issues +and tests/test_nanoarrow.py for usage. + +## Installation + +Python bindings for nanoarrow are not yet available on PyPI. You can install via +URL (requires a C compiler): + +```bash +python -m pip install "https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python" +``` + ## Building -Python libraries are managed with [setuptools][setuptools]. In general, that -means all projects can be built as follows: +Python bindings for nanoarrow are managed with setuptools[setuptools]. This means you +can build the project using: ```shell -$ cd python -$ pip install -e . +git clone https://github.com/apache/arrow-nanoarrow.git +cd python +pip install -e . 
``` Tests use [pytest][pytest]: ```shell # Install dependencies -$ pip install -e .[test] +pip install -e .[test] # Run tests -$ pytest -vvx +pytest -vvx ``` [pytest]: https://docs.pytest.org/ -[setuptools]: https://setuptools.pypa.io/en/latest/index.html \ No newline at end of file +[setuptools]: https://setuptools.pypa.io/en/latest/index.html diff --git a/python/bootstrap.py b/python/bootstrap.py index 3ed7b10e2..2aeb90735 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -164,7 +164,11 @@ def copy_or_generate_nanoarrow_c(): os.system(f'cmake --install . --prefix=../src/nanoarrow') finally: if os.path.exists(build_dir): - shutil.rmtree(build_dir) + # Can fail on Windows with permission issues + try: + shutil.rmtree(build_dir) + except Exception as e: + print(f'Failed to remove _cmake temp directory: {str(e)}') os.chdir(this_wd) elif is_in_nanoarrow_repo: From 085b82a47842f94eb7affe94f8c075c4f912f2ef Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 Mar 2023 14:11:18 -0400 Subject: [PATCH 25/52] Empty -> empty --- python/src/nanoarrow/_lib.pyx | 2 +- python/tests/test_nanoarrow.py | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 1aa414b01..b06b26fe7 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -71,7 +71,7 @@ cdef class Schema: cdef ArrowSchema* _ptr @staticmethod - def Empty(): + def empty(): base = SchemaHolder() return Schema(base, base._addr()) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index af5b24588..8935d5442 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -27,7 +27,7 @@ def test_version(): assert re_version.match(na.version()) is not None def test_schema_basic(): - schema = na.Schema.Empty() + schema = na.Schema.empty() assert schema.is_valid() is False assert repr(schema) == "[invalid: schema is released]" @@ 
-47,13 +47,13 @@ def test_schema_basic(): schema.children[1] def test_schema_dictionary(): - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.dictionary(pa.int32(), pa.utf8())._export_to_c(schema._addr()) assert schema.format == 'i' assert schema.dictionary.format == 'u' def test_schema_metadata(): - schema = na.Schema.Empty() + schema = na.Schema.empty() meta = {'key1': 'value1', 'key2': 'value2'} pa.field('', pa.int32(), metadata=meta)._export_to_c(schema._addr()) @@ -64,7 +64,7 @@ def test_schema_metadata(): assert list(meta2.values()) == [b'value1', b'value2'] def test_schema_view(): - schema = na.Schema.Empty() + schema = na.Schema.empty() with pytest.raises(RuntimeError): schema.view() @@ -83,35 +83,35 @@ def test_schema_view(): assert view.extension_metadata is None def test_schema_view_extra_params(): - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.binary(12)._export_to_c(schema._addr()) view = schema.view() assert view.fixed_size == 12 - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.list_(pa.int32(), 12)._export_to_c(schema._addr()) assert view.fixed_size == 12 - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.decimal128(10, 3)._export_to_c(schema._addr()) view = schema.view() assert view.decimal_bitwidth == 128 assert view.decimal_precision == 10 assert view.decimal_scale == 3 - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.decimal256(10, 3)._export_to_c(schema._addr()) view = schema.view() assert view.decimal_bitwidth == 256 assert view.decimal_precision == 10 assert view.decimal_scale == 3 - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.duration('us')._export_to_c(schema._addr()) view = schema.view() assert view.time_unit == 'us' - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.timestamp('us', tz='America/Halifax')._export_to_c(schema._addr()) view = schema.view() assert view.type == 'timestamp' @@ -119,7 +119,7 @@ def test_schema_view_extra_params(): assert 
view.time_unit == 'us' assert view.timezone == 'America/Halifax' - schema = na.Schema.Empty() + schema = na.Schema.empty() meta = { 'ARROW:extension:name': 'some_name', 'ARROW:extension:metadata': 'some_metadata' @@ -130,7 +130,7 @@ def test_schema_view_extra_params(): assert view.extension_metadata == b'some_metadata' def test_array(): - schema = na.Schema.Empty() + schema = na.Schema.empty() pa.int32()._export_to_c(schema._addr()) array = na.Array.Empty(schema) @@ -150,7 +150,7 @@ def test_array(): array.children[1] def test_array_view(): - array = na.Array.Empty(na.Schema.Empty()) + array = na.Array.Empty(na.Schema.empty()) pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr(), array.schema._addr()) view = array.view() @@ -173,7 +173,7 @@ def test_array_view_recursive(): pa_array_child = pa.array([1, 2, 3], pa.int32()) pa_array = pa.record_batch([pa_array_child], names=["some_column"]) - array = na.Array.Empty(na.Schema.Empty()) + array = na.Array.Empty(na.Schema.empty()) pa_array._export_to_c(array._addr(), array.schema._addr()) assert array.schema.format == '+s' From a4d0490c09ae9a54cd7f86f2b9bf31ea63faa51d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 9 Jun 2023 16:32:03 -0300 Subject: [PATCH 26/52] nogil --- python/bootstrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index 2aeb90735..8cdc83dcd 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -51,7 +51,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): with open(file_out, 'wb') as output: output.write(header.encode('UTF-8')) - output.write(f'\ncdef extern from "{file_in_name}":\n'.encode("UTF-8")) + output.write(f'\ncdef extern from "{file_in_name}" nogil:\n'.encode("UTF-8")) # A few things we add in manually output.write(b'\n') From 55dcdb41111e749e245156c9c2e1b30a9cbefd5c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 9 Jun 2023 16:51:17 -0300 Subject: [PATCH 27/52] add buffer data types to 
properly set format --- python/src/nanoarrow/_lib.pyx | 46 ++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index b06b26fe7..20fddc6a5 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -34,7 +34,7 @@ cdef class SchemaHolder: def __init__(self): self.c_schema.release = NULL - def __del__(self): + def __dealloc__(self): if self.c_schema.release != NULL: self.c_schema.release(&self.c_schema) @@ -47,7 +47,7 @@ cdef class ArrayHolder: def __init__(self): self.c_array.release = NULL - def __del__(self): + def __dealloc__(self): if self.c_array.release != NULL: self.c_array.release(&self.c_array) @@ -60,7 +60,7 @@ cdef class ArrayViewHolder: def __init__(self): ArrowArrayViewInitFromType(&self.c_array_view, NANOARROW_TYPE_UNINITIALIZED) - def __del__(self): + def __dealloc__(self): ArrowArrayViewReset(&self.c_array_view) def _addr(self): @@ -441,18 +441,47 @@ cdef class ArrayViewChildren: cdef class BufferView: cdef object _base cdef ArrowBufferView* _ptr + cdef ArrowBufferType _buffer_type + cdef ArrowType _buffer_data_type cdef Py_ssize_t _shape cdef Py_ssize_t _strides - def __init__(self, object base, uintptr_t addr): + def __init__(self, object base, uintptr_t addr, + ArrowBufferType buffer_type, ArrowType buffer_data_type): self._base = base self._ptr = addr + self._buffer_type = buffer_type + self._buffer_data_type = buffer_data_type self._shape = self._ptr.size_bytes self._strides = 1 + cdef const char* _get_format(self): + if self._buffer_data_type == NANOARROW_TYPE_INT8: + return "h" + elif self._buffer_data_type == NANOARROW_TYPE_UINT8: + return "B" + elif self._buffer_data_type == NANOARROW_TYPE_INT16: + return "h" + elif self._buffer_data_type == NANOARROW_TYPE_UINT16: + return "H" + elif self._buffer_data_type == NANOARROW_TYPE_INT32: + return "i" + elif self._buffer_data_type == NANOARROW_TYPE_UINT32: + return "I" + 
elif self._buffer_data_type == NANOARROW_TYPE_INT64: + return "l" + elif self._buffer_data_type == NANOARROW_TYPE_UINT64: + return "L" + elif self._buffer_data_type == NANOARROW_TYPE_FLOAT: + return "f" + elif self._buffer_data_type == NANOARROW_TYPE_DOUBLE: + return "B" + else: + return "z" + def __getbuffer__(self, Py_buffer *buffer, int flags): buffer.buf = self._ptr.data.data - buffer.format = NULL + buffer.format = self._get_format() buffer.internal = NULL buffer.itemsize = 1 buffer.len = self._ptr.size_bytes @@ -482,4 +511,9 @@ cdef class ArrayViewBuffers: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k]) - return BufferView(self._array_view, buffer_view) + return BufferView( + self._array_view, + buffer_view, + self._array_view._ptr.layout.buffer_type[k], + self._array_view._ptr.layout.buffer_data_type[k] + ) From 984ea76592ca3bbe8f5e51a8fafc8a6baedca549 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 9 Jun 2023 16:53:46 -0300 Subject: [PATCH 28/52] add dictionary member --- python/src/nanoarrow/_lib.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 20fddc6a5..3d18650f3 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -335,6 +335,10 @@ cdef class ArrayView: def buffers(self): return ArrayViewBuffers(self) + @property + def dictionary(self): + return ArrayView(self, self._ptr.dictionary, self._array.dictionary) + @property def array(self): return self._array From fc0a7dcc6b1371f056a10f28a8173f3890f91e10 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 9 Jun 2023 23:04:22 -0300 Subject: [PATCH 29/52] more buffer info --- python/src/nanoarrow/_lib.pyx | 31 ++++++++++++++++++++++++------- python/tests/test_nanoarrow.py | 15 ++++++++++++++- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git 
a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 3d18650f3..06909db73 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -447,17 +447,31 @@ cdef class BufferView: cdef ArrowBufferView* _ptr cdef ArrowBufferType _buffer_type cdef ArrowType _buffer_data_type + cdef Py_ssize_t _element_size_bits cdef Py_ssize_t _shape cdef Py_ssize_t _strides def __init__(self, object base, uintptr_t addr, - ArrowBufferType buffer_type, ArrowType buffer_data_type): + ArrowBufferType buffer_type, ArrowType buffer_data_type, + Py_ssize_t element_size_bits): self._base = base self._ptr = addr self._buffer_type = buffer_type self._buffer_data_type = buffer_data_type - self._shape = self._ptr.size_bytes - self._strides = 1 + self._element_size_bits = element_size_bits + self._strides = self._item_size() + self._shape = self._ptr.size_bytes // self._strides + + + cdef Py_ssize_t _item_size(self): + if self._buffer_data_type == NANOARROW_TYPE_BOOL: + return 1 + elif self._buffer_data_type == NANOARROW_TYPE_STRING: + return 1 + elif self._buffer_data_type == NANOARROW_TYPE_BINARY: + return 1 + else: + return self._element_size_bits // 8 cdef const char* _get_format(self): if self._buffer_data_type == NANOARROW_TYPE_INT8: @@ -479,15 +493,17 @@ cdef class BufferView: elif self._buffer_data_type == NANOARROW_TYPE_FLOAT: return "f" elif self._buffer_data_type == NANOARROW_TYPE_DOUBLE: - return "B" + return "d" + elif self._buffer_data_type == NANOARROW_TYPE_STRING: + return "c" else: - return "z" + return "B" def __getbuffer__(self, Py_buffer *buffer, int flags): buffer.buf = self._ptr.data.data buffer.format = self._get_format() buffer.internal = NULL - buffer.itemsize = 1 + buffer.itemsize = self._strides buffer.len = self._ptr.size_bytes buffer.ndim = 1 buffer.obj = self @@ -519,5 +535,6 @@ cdef class ArrayViewBuffers: self._array_view, buffer_view, self._array_view._ptr.layout.buffer_type[k], - 
self._array_view._ptr.layout.buffer_data_type[k] + self._array_view._ptr.layout.buffer_data_type[k], + self._array_view._ptr.layout.element_size_bits[k] ) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 8935d5442..b436b6e85 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -158,8 +158,8 @@ def test_array_view(): assert view.schema is array.schema data_buffer = memoryview(view.buffers[1]) - assert len(data_buffer) == 12 data_buffer_copy = bytes(data_buffer) + assert len(data_buffer_copy) == 12 if sys.byteorder == 'little': assert data_buffer_copy == b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00' @@ -194,3 +194,16 @@ def test_array_view_recursive(): assert view.children[0].array._addr() == array.children[0]._addr() assert view.children[0].schema._addr() == array.schema.children[0]._addr() assert view.children[0].schema._addr() == array.children[0].schema._addr() + +def test_array_view_dictionary(): + pa_array = pa.array(["a", "b", "b"], pa.dictionary(pa.int32(), pa.utf8())) + + array = na.Array.Empty(na.Schema.empty()) + pa_array._export_to_c(array._addr(), array.schema._addr()) + + assert array.schema.format == 'i' + assert array.dictionary.schema.format == 'u' + + view = array.view() + assert len(view.buffers) == 2 + assert len(view.dictionary.buffers) == 3 From 33dcbe7e6e31d47615e1c395a94d978744380d4e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 9 Jun 2023 23:40:41 -0300 Subject: [PATCH 30/52] test buffer access with numpy --- python/pyproject.toml | 2 +- python/src/nanoarrow/_lib.pyx | 5 ++- python/tests/test_nanoarrow.py | 58 ++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 52b7d5bc5..3e4ee75a8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -25,7 +25,7 @@ license = {text = "Apache-2.0"} requires-python = ">=3.8" [project.optional-dependencies] -test = 
["pyarrow", "pytest"] +test = ["pyarrow", "pytest", "numpy"] [project.urls] homepage = "https://arrow.apache.org" diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 06909db73..7fc86780a 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -475,7 +475,7 @@ cdef class BufferView: cdef const char* _get_format(self): if self._buffer_data_type == NANOARROW_TYPE_INT8: - return "h" + return "b" elif self._buffer_data_type == NANOARROW_TYPE_UINT8: return "B" elif self._buffer_data_type == NANOARROW_TYPE_INT16: @@ -531,6 +531,9 @@ cdef class ArrayViewBuffers: if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") cdef ArrowBufferView* buffer_view = &(self._array_view._ptr.buffer_views[k]) + if buffer_view.data.data == NULL: + return None + return BufferView( self._array_view, buffer_view, diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index b436b6e85..68b809d35 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -17,6 +17,7 @@ import sys import re +import numpy as np import pyarrow as pa import pytest @@ -207,3 +208,60 @@ def test_array_view_dictionary(): view = array.view() assert len(view.buffers) == 2 assert len(view.dictionary.buffers) == 3 + +def test_buffers_data(): + data_types = [ + (pa.uint8(), np.uint8()), + (pa.int8(), np.int8()), + (pa.uint16(), np.uint16()), + (pa.int16(), np.int16()), + (pa.uint32(), np.uint32()), + (pa.int32(), np.int32()), + (pa.uint64(), np.uint64()), + (pa.int64(), np.int64()), + (pa.float32(), np.float32()), + (pa.float64(), np.float64()) + ] + + for pa_type, np_type in data_types: + pa_array = pa.array([0, 1, 2], pa_type) + array = na.Array.Empty(na.Schema.empty()) + pa_array._export_to_c(array._addr(), array.schema._addr()) + view = array.view() + + np.testing.assert_array_equal( + np.array(view.buffers[1]), + np.array([0, 1, 2], np_type) + ) + +def test_buffers_string(): + pa_array = 
pa.array(["a", "bc", "def"]) + array = na.Array.Empty(na.Schema.empty()) + pa_array._export_to_c(array._addr(), array.schema._addr()) + view = array.view() + + assert view.buffers[0] is None + np.testing.assert_array_equal( + np.array(view.buffers[1]), + np.array([0, 1, 3, 6], np.int32()) + ) + np.testing.assert_array_equal( + np.array(view.buffers[2]), + np.array(list("abcdef"), dtype='|S1') + ) + +def test_buffers_binary(): + pa_array = pa.array([b"a", b"bc", b"def"]) + array = na.Array.Empty(na.Schema.empty()) + pa_array._export_to_c(array._addr(), array.schema._addr()) + view = array.view() + + assert view.buffers[0] is None + np.testing.assert_array_equal( + np.array(view.buffers[1]), + np.array([0, 1, 3, 6], np.int32()) + ) + np.testing.assert_array_equal( + np.array(view.buffers[2]), + np.array(list(b"abcdef")) + ) From 280193b35454a492291535ad8ddc6b706b9f06e3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 9 Jun 2023 23:46:01 -0300 Subject: [PATCH 31/52] format with black --- python/src/nanoarrow/__init__.py | 7 +-- python/tests/test_nanoarrow.py | 88 +++++++++++++++++--------------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 18847ccad..9a96e58e8 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -15,9 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-from ._lib import ( # noqa: F401 - version, - Schema, - Array, - ArrayView -) +from ._lib import version, Schema, Array, ArrayView diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 68b809d35..ca31ebbfc 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -23,10 +23,12 @@ import nanoarrow as na + def test_version(): - re_version = re.compile(r'^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$') + re_version = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$") assert re_version.match(na.version()) is not None + def test_schema_basic(): schema = na.Schema.empty() assert schema.is_valid() is False @@ -47,22 +49,25 @@ def test_schema_basic(): with pytest.raises(IndexError): schema.children[1] + def test_schema_dictionary(): schema = na.Schema.empty() pa.dictionary(pa.int32(), pa.utf8())._export_to_c(schema._addr()) - assert schema.format == 'i' - assert schema.dictionary.format == 'u' + assert schema.format == "i" + assert schema.dictionary.format == "u" + def test_schema_metadata(): schema = na.Schema.empty() - meta = {'key1': 'value1', 'key2': 'value2'} - pa.field('', pa.int32(), metadata=meta)._export_to_c(schema._addr()) + meta = {"key1": "value1", "key2": "value2"} + pa.field("", pa.int32(), metadata=meta)._export_to_c(schema._addr()) assert len(schema.metadata) == 2 meta2 = {k: v for k, v in schema.metadata} - assert list(meta2.keys()) == ['key1', 'key2'] - assert list(meta2.values()) == [b'value1', b'value2'] + assert list(meta2.keys()) == ["key1", "key2"] + assert list(meta2.values()) == [b"value1", b"value2"] + def test_schema_view(): schema = na.Schema.empty() @@ -71,8 +76,8 @@ def test_schema_view(): pa.int32()._export_to_c(schema._addr()) view = schema.view() - assert view.type == 'int32' - assert view.storage_type == 'int32' + assert view.type == "int32" + assert view.storage_type == "int32" assert view.fixed_size is None assert view.decimal_bitwidth is None @@ -83,6 +88,7 @@ def test_schema_view(): 
assert view.extension_name is None assert view.extension_metadata is None + def test_schema_view_extra_params(): schema = na.Schema.empty() pa.binary(12)._export_to_c(schema._addr()) @@ -108,27 +114,28 @@ def test_schema_view_extra_params(): assert view.decimal_scale == 3 schema = na.Schema.empty() - pa.duration('us')._export_to_c(schema._addr()) + pa.duration("us")._export_to_c(schema._addr()) view = schema.view() - assert view.time_unit == 'us' + assert view.time_unit == "us" schema = na.Schema.empty() - pa.timestamp('us', tz='America/Halifax')._export_to_c(schema._addr()) + pa.timestamp("us", tz="America/Halifax")._export_to_c(schema._addr()) view = schema.view() - assert view.type == 'timestamp' - assert view.storage_type == 'int64' - assert view.time_unit == 'us' - assert view.timezone == 'America/Halifax' + assert view.type == "timestamp" + assert view.storage_type == "int64" + assert view.time_unit == "us" + assert view.timezone == "America/Halifax" schema = na.Schema.empty() meta = { - 'ARROW:extension:name': 'some_name', - 'ARROW:extension:metadata': 'some_metadata' + "ARROW:extension:name": "some_name", + "ARROW:extension:metadata": "some_metadata", } - pa.field('', pa.int32(), metadata=meta)._export_to_c(schema._addr()) + pa.field("", pa.int32(), metadata=meta)._export_to_c(schema._addr()) view = schema.view() - assert view.extension_name == 'some_name' - assert view.extension_metadata == b'some_metadata' + assert view.extension_name == "some_name" + assert view.extension_metadata == b"some_metadata" + def test_array(): schema = na.Schema.empty() @@ -150,6 +157,7 @@ def test_array(): with pytest.raises(IndexError): array.children[1] + def test_array_view(): array = na.Array.Empty(na.Schema.empty()) pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr(), array.schema._addr()) @@ -162,14 +170,15 @@ def test_array_view(): data_buffer_copy = bytes(data_buffer) assert len(data_buffer_copy) == 12 - if sys.byteorder == 'little': - assert data_buffer_copy == 
b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00' + if sys.byteorder == "little": + assert data_buffer_copy == b"\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00" else: - assert data_buffer_copy == b'\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03' + assert data_buffer_copy == b"\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03" with pytest.raises(IndexError): view.children[1] + def test_array_view_recursive(): pa_array_child = pa.array([1, 2, 3], pa.int32()) pa_array = pa.record_batch([pa_array_child], names=["some_column"]) @@ -177,11 +186,11 @@ def test_array_view_recursive(): array = na.Array.Empty(na.Schema.empty()) pa_array._export_to_c(array._addr(), array.schema._addr()) - assert array.schema.format == '+s' + assert array.schema.format == "+s" assert array.length == 3 assert len(array.children) == 1 - assert array.children[0].schema.format == 'i' + assert array.children[0].schema.format == "i" assert array.children[0].length == 3 assert array.children[0].schema._addr() == array.schema.children[0]._addr() @@ -196,19 +205,21 @@ def test_array_view_recursive(): assert view.children[0].schema._addr() == array.schema.children[0]._addr() assert view.children[0].schema._addr() == array.children[0].schema._addr() + def test_array_view_dictionary(): pa_array = pa.array(["a", "b", "b"], pa.dictionary(pa.int32(), pa.utf8())) array = na.Array.Empty(na.Schema.empty()) pa_array._export_to_c(array._addr(), array.schema._addr()) - assert array.schema.format == 'i' - assert array.dictionary.schema.format == 'u' + assert array.schema.format == "i" + assert array.dictionary.schema.format == "u" view = array.view() assert len(view.buffers) == 2 assert len(view.dictionary.buffers) == 3 + def test_buffers_data(): data_types = [ (pa.uint8(), np.uint8()), @@ -220,7 +231,7 @@ def test_buffers_data(): (pa.uint64(), np.uint64()), (pa.int64(), np.int64()), (pa.float32(), np.float32()), - (pa.float64(), np.float64()) + (pa.float64(), np.float64()), ] for pa_type, np_type in 
data_types: @@ -230,10 +241,10 @@ def test_buffers_data(): view = array.view() np.testing.assert_array_equal( - np.array(view.buffers[1]), - np.array([0, 1, 2], np_type) + np.array(view.buffers[1]), np.array([0, 1, 2], np_type) ) + def test_buffers_string(): pa_array = pa.array(["a", "bc", "def"]) array = na.Array.Empty(na.Schema.empty()) @@ -242,14 +253,13 @@ def test_buffers_string(): assert view.buffers[0] is None np.testing.assert_array_equal( - np.array(view.buffers[1]), - np.array([0, 1, 3, 6], np.int32()) + np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32()) ) np.testing.assert_array_equal( - np.array(view.buffers[2]), - np.array(list("abcdef"), dtype='|S1') + np.array(view.buffers[2]), np.array(list("abcdef"), dtype="|S1") ) + def test_buffers_binary(): pa_array = pa.array([b"a", b"bc", b"def"]) array = na.Array.Empty(na.Schema.empty()) @@ -258,10 +268,6 @@ def test_buffers_binary(): assert view.buffers[0] is None np.testing.assert_array_equal( - np.array(view.buffers[1]), - np.array([0, 1, 3, 6], np.int32()) - ) - np.testing.assert_array_equal( - np.array(view.buffers[2]), - np.array(list(b"abcdef")) + np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32()) ) + np.testing.assert_array_equal(np.array(view.buffers[2]), np.array(list(b"abcdef"))) From 97df28ece7a6adcf58fa27b0fba24af7f931f5d2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 10 Jun 2023 23:22:34 -0300 Subject: [PATCH 32/52] some helpers --- python/pyproject.toml | 2 +- python/src/nanoarrow/__init__.py | 1 + python/src/nanoarrow/_lib.pyx | 2 +- python/src/nanoarrow/lib.py | 31 +++++++++++ python/tests/test_nanoarrow.py | 88 +++++++++++++++----------------- 5 files changed, 74 insertions(+), 50 deletions(-) create mode 100644 python/src/nanoarrow/lib.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 3e4ee75a8..743cebe0c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -19,7 +19,7 @@ [project] name = "nanoarrow" version = "1.0.0-alpha0" 
-description = "" +description = "Python bindings to the nanoarrow C library" authors = [{name = "Apache Arrow Developers", email = "dev@arrow.apache.org"}] license = {text = "Apache-2.0"} requires-python = ">=3.8" diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 9a96e58e8..5b648d247 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -16,3 +16,4 @@ # under the License. from ._lib import version, Schema, Array, ArrayView +from .lib import schema, array diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 7fc86780a..18bcf7fbf 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -249,7 +249,7 @@ cdef class Array: cdef Schema _schema @staticmethod - def Empty(Schema schema): + def empty(Schema schema): base = ArrayHolder() return Array(base, base._addr(), schema) diff --git a/python/src/nanoarrow/lib.py b/python/src/nanoarrow/lib.py new file mode 100644 index 000000000..e0c8c508d --- /dev/null +++ b/python/src/nanoarrow/lib.py @@ -0,0 +1,31 @@ +from ._lib import Schema, Array + + +def schema(obj): + if isinstance(obj, Schema): + return obj + + # Not entirely safe but will have to do until there's a dunder method + if hasattr(obj, "_export_to_c"): + out = Schema.empty() + obj._export_to_c(out._addr()) + return out + else: + raise TypeError( + f"Can't convert object of type {type(obj).__name__} to nanoarrow.Schema" + ) + + +def array(obj): + if isinstance(obj, Array): + return obj + + # Not entirely safe but will have to do until there's a dunder method + if hasattr(obj, "_export_to_c"): + out = Array.empty(Schema.empty()) + obj._export_to_c(out._addr(), out.schema._addr()) + return out + else: + raise TypeError( + f"Can't convert object of type {type(obj).__name__} to nanoarrow.Array" + ) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index ca31ebbfc..e340c7c86 100644 --- a/python/tests/test_nanoarrow.py +++ 
b/python/tests/test_nanoarrow.py @@ -29,13 +29,34 @@ def test_version(): assert re_version.match(na.version()) is not None +def test_schema_helper(): + schema = na.Schema.empty() + assert na.schema(schema) is schema + + schema = na.schema(pa.null()) + assert isinstance(schema, na.Schema) + + with pytest.raises(TypeError): + na.schema(None) + + +def test_array_helper(): + array = na.Array.empty(na.Schema.empty()) + assert na.array(array) is array + + array = na.array(pa.array([], pa.null())) + assert isinstance(array, na.Array) + + with pytest.raises(TypeError): + na.schema(None) + + def test_schema_basic(): schema = na.Schema.empty() assert schema.is_valid() is False assert repr(schema) == "[invalid: schema is released]" - pa_schema = pa.schema([pa.field("some_name", pa.int32())]) - pa_schema._export_to_c(schema._addr()) + schema = na.schema(pa.schema([pa.field("some_name", pa.int32())])) assert schema.format == "+s" assert schema.flags == 0 @@ -51,16 +72,14 @@ def test_schema_basic(): def test_schema_dictionary(): - schema = na.Schema.empty() - pa.dictionary(pa.int32(), pa.utf8())._export_to_c(schema._addr()) + schema = na.schema(pa.dictionary(pa.int32(), pa.utf8())) assert schema.format == "i" assert schema.dictionary.format == "u" def test_schema_metadata(): - schema = na.Schema.empty() meta = {"key1": "value1", "key2": "value2"} - pa.field("", pa.int32(), metadata=meta)._export_to_c(schema._addr()) + schema = na.schema(pa.field("", pa.int32(), metadata=meta)) assert len(schema.metadata) == 2 @@ -74,7 +93,7 @@ def test_schema_view(): with pytest.raises(RuntimeError): schema.view() - pa.int32()._export_to_c(schema._addr()) + schema = na.schema(pa.int32()) view = schema.view() assert view.type == "int32" assert view.storage_type == "int32" @@ -90,61 +109,48 @@ def test_schema_view(): def test_schema_view_extra_params(): - schema = na.Schema.empty() - pa.binary(12)._export_to_c(schema._addr()) + schema = na.schema(pa.binary(12)) view = schema.view() assert 
view.fixed_size == 12 - schema = na.Schema.empty() - pa.list_(pa.int32(), 12)._export_to_c(schema._addr()) + schema = na.schema(pa.list_(pa.int32(), 12)) assert view.fixed_size == 12 - schema = na.Schema.empty() - pa.decimal128(10, 3)._export_to_c(schema._addr()) + schema = na.schema(pa.decimal128(10, 3)) view = schema.view() assert view.decimal_bitwidth == 128 assert view.decimal_precision == 10 assert view.decimal_scale == 3 - schema = na.Schema.empty() - pa.decimal256(10, 3)._export_to_c(schema._addr()) + schema = na.schema(pa.decimal256(10, 3)) view = schema.view() assert view.decimal_bitwidth == 256 assert view.decimal_precision == 10 assert view.decimal_scale == 3 - schema = na.Schema.empty() - pa.duration("us")._export_to_c(schema._addr()) + schema = na.schema(pa.duration("us")) view = schema.view() assert view.time_unit == "us" - schema = na.Schema.empty() - pa.timestamp("us", tz="America/Halifax")._export_to_c(schema._addr()) + schema = na.schema(pa.timestamp("us", tz="America/Halifax")) view = schema.view() assert view.type == "timestamp" assert view.storage_type == "int64" assert view.time_unit == "us" assert view.timezone == "America/Halifax" - schema = na.Schema.empty() meta = { "ARROW:extension:name": "some_name", "ARROW:extension:metadata": "some_metadata", } - pa.field("", pa.int32(), metadata=meta)._export_to_c(schema._addr()) + schema = na.schema(pa.field("", pa.int32(), metadata=meta)) view = schema.view() assert view.extension_name == "some_name" assert view.extension_metadata == b"some_metadata" def test_array(): - schema = na.Schema.empty() - pa.int32()._export_to_c(schema._addr()) - - array = na.Array.Empty(schema) - assert array.is_valid() is False - - pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr()) + array = na.array(pa.array([1, 2, 3], pa.int32())) assert array.is_valid() is True assert array.length == 3 assert array.offset == 0 @@ -159,8 +165,7 @@ def test_array(): def test_array_view(): - array = 
na.Array.Empty(na.Schema.empty()) - pa.array([1, 2, 3], pa.int32())._export_to_c(array._addr(), array.schema._addr()) + array = na.array(pa.array([1, 2, 3], pa.int32())) view = array.view() assert view.array is array @@ -183,8 +188,7 @@ def test_array_view_recursive(): pa_array_child = pa.array([1, 2, 3], pa.int32()) pa_array = pa.record_batch([pa_array_child], names=["some_column"]) - array = na.Array.Empty(na.Schema.empty()) - pa_array._export_to_c(array._addr(), array.schema._addr()) + array = na.array(pa_array) assert array.schema.format == "+s" assert array.length == 3 @@ -208,9 +212,7 @@ def test_array_view_recursive(): def test_array_view_dictionary(): pa_array = pa.array(["a", "b", "b"], pa.dictionary(pa.int32(), pa.utf8())) - - array = na.Array.Empty(na.Schema.empty()) - pa_array._export_to_c(array._addr(), array.schema._addr()) + array = na.array(pa_array) assert array.schema.format == "i" assert array.dictionary.schema.format == "u" @@ -235,21 +237,14 @@ def test_buffers_data(): ] for pa_type, np_type in data_types: - pa_array = pa.array([0, 1, 2], pa_type) - array = na.Array.Empty(na.Schema.empty()) - pa_array._export_to_c(array._addr(), array.schema._addr()) - view = array.view() - + view = na.array(pa.array([0, 1, 2], pa_type)).view() np.testing.assert_array_equal( np.array(view.buffers[1]), np.array([0, 1, 2], np_type) ) def test_buffers_string(): - pa_array = pa.array(["a", "bc", "def"]) - array = na.Array.Empty(na.Schema.empty()) - pa_array._export_to_c(array._addr(), array.schema._addr()) - view = array.view() + view = na.array(pa.array(["a", "bc", "def"])).view() assert view.buffers[0] is None np.testing.assert_array_equal( @@ -261,10 +256,7 @@ def test_buffers_string(): def test_buffers_binary(): - pa_array = pa.array([b"a", b"bc", b"def"]) - array = na.Array.Empty(na.Schema.empty()) - pa_array._export_to_c(array._addr(), array.schema._addr()) - view = array.view() + view = na.array(pa.array([b"a", b"bc", b"def"])).view() assert view.buffers[0] 
is None np.testing.assert_array_equal( From d7fd6e181f7c5c0e672e626af92294912079bace Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 10 Jun 2023 23:24:44 -0300 Subject: [PATCH 33/52] rename version --- python/src/nanoarrow/__init__.py | 2 +- python/src/nanoarrow/_lib.pyx | 2 +- python/tests/test_nanoarrow.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 5b648d247..4b41e0674 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -15,5 +15,5 @@ # specific language governing permissions and limitations # under the License. -from ._lib import version, Schema, Array, ArrayView +from ._lib import c_version, Schema, Array, ArrayView from .lib import schema, array diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 18bcf7fbf..c8873b661 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -25,7 +25,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize from cpython cimport Py_buffer from nanoarrow_c cimport * -def version(): +def c_version(): return ArrowNanoarrowVersion().decode("UTF-8") cdef class SchemaHolder: diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index e340c7c86..eeb04a7e2 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -24,9 +24,9 @@ import nanoarrow as na -def test_version(): +def test_c_version(): re_version = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$") - assert re_version.match(na.version()) is not None + assert re_version.match(na.c_version()) is not None def test_schema_helper(): From 5c02647f82e465a72f212667670e7b7452280573 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 11 Jun 2023 15:19:51 -0300 Subject: [PATCH 34/52] start on stream --- python/src/nanoarrow/_lib.pyx | 117 +++++++++++++++++++++++++++++++++- python/src/nanoarrow/lib.py | 25 +++++++- 2 files changed, 138 
insertions(+), 4 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index c8873b661..69a9b4e59 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -31,7 +31,7 @@ def c_version(): cdef class SchemaHolder: cdef ArrowSchema c_schema - def __init__(self): + def __cinit__(self): self.c_schema.release = NULL def __dealloc__(self): @@ -44,7 +44,7 @@ cdef class SchemaHolder: cdef class ArrayHolder: cdef ArrowArray c_array - def __init__(self): + def __cinit__(self): self.c_array.release = NULL def __dealloc__(self): @@ -54,6 +54,19 @@ cdef class ArrayHolder: def _addr(self): return &self.c_array +cdef class ArrayStreamHolder: + cdef ArrowArrayStream c_array_stream + + def __cinit__(self): + self.c_array_stream.release = NULL + + def __dealloc__(self): + if self.c_array_stream.release != NULL: + self.c_array_stream.release(&self.c_array_stream) + + def _addr(self): + return &self.c_array_stream + cdef class ArrayViewHolder: cdef ArrowArrayView c_array_view @@ -66,6 +79,34 @@ cdef class ArrayViewHolder: def _addr(self): return &self.c_array_view + +class NanoarrowException(RuntimeError): + + def __init__(self, what, code, message): + self.what = what + self.code = code + self.message = message + + if self.message == "": + super().__init__(f"{self.what} failed ({self.code})") + else: + super().__init__(f"{self.what} failed ({self.code}): {self.message}") + + +cdef class Error: + cdef ArrowError c_error + + def __cinit__(self): + self.c_error.message[0] = 0 + + def raise_message(self, what, code): + raise Exception(what, code, self.c_error.message.decode("UTF-8")) + + @staticmethod + def raise_error(what, code): + raise Exception(what, code, "") + + cdef class Schema: cdef object _base cdef ArrowSchema* _ptr @@ -317,6 +358,7 @@ cdef class Array: return ArrayView(holder, holder._addr(), self) + cdef class ArrayView: cdef object _base cdef ArrowArrayView* _ptr @@ -541,3 +583,74 @@ cdef class ArrayViewBuffers: 
self._array_view._ptr.layout.buffer_data_type[k], self._array_view._ptr.layout.element_size_bits[k] ) + + +cdef class ArrayStream: + cdef object _base + cdef ArrowArrayStream* _ptr + + def __init__(self, object base, uintptr_t addr): + self._base = base, + self._ptr = addr + self._cached_schema = None + + def is_valid(self): + return self._ptr != NULL and self._ptr.release != NULL + + def _assert_valid(self): + if self._ptr == NULL: + raise RuntimeError("array stream pointer is NULL") + if self._ptr.release == NULL: + raise RuntimeError("array stream is released") + + def _get_schema(self, Schema schema): + self._assert_valid() + cdef int code = self._ptr.get_schema(self._ptr, schema._ptr) + cdef const char* message = NULL + if code != NANOARROW_OK: + message = self._ptr.get_last_error(self._ptr) + if message != NULL: + raise NanoarrowException( + "ArrowArrayStream::get_schema()", + code, + message.decode("UTF-8") + ) + else: + Error.raise_error("ArrowArrayStream::get_schema()", code) + + self._cached_schema = schema + + def get_schema(self): + # Update the cached copy of the schema as an independent object + if self._cached_schema is not None: + del self._cached_schema + self._cached_schema = Schema.empty() + self._get_schema(self._cached_schema) + + # Return an independent copy + out = Schema.empty() + self._get_schema(out) + return out + + def get_next(self): + self._assert_valid() + + if self._cached_schema is None: + self._cached_schema = Schema.empty() + self._get_schema(self._cached_schema) + + cdef Array array = Array.empty(self._cached_schema) + cdef int code = self._ptr.get_next(self._ptr, array._ptr) + cdef const char* message = NULL + if code != NANOARROW_OK: + message = self._ptr.get_last_error(self._ptr) + if message != NULL: + raise NanoarrowException( + "ArrowArrayStream::get_next()", + code, + message.decode("UTF-8") + ) + else: + Error.raise_error("ArrowArrayStream::get_next()", code) + + return array diff --git a/python/src/nanoarrow/lib.py 
b/python/src/nanoarrow/lib.py index e0c8c508d..549e9d096 100644 --- a/python/src/nanoarrow/lib.py +++ b/python/src/nanoarrow/lib.py @@ -5,7 +5,9 @@ def schema(obj): if isinstance(obj, Schema): return obj - # Not entirely safe but will have to do until there's a dunder method + # Not particularly safe because _export_to_c() could be exporting an + # array, schema, or array_stream. The ideal + # solution here would be something like __arrow_c_schema__() if hasattr(obj, "_export_to_c"): out = Schema.empty() obj._export_to_c(out._addr()) @@ -20,7 +22,9 @@ def array(obj): if isinstance(obj, Array): return obj - # Not entirely safe but will have to do until there's a dunder method + # Somewhat safe because calling _export_to_c() with two arguments will + # not fail with a crash (but will fail with a confusing error). The ideal + # solution here would be something like __arrow_c_array__() if hasattr(obj, "_export_to_c"): out = Array.empty(Schema.empty()) obj._export_to_c(out._addr(), out.schema._addr()) @@ -29,3 +33,20 @@ def array(obj): raise TypeError( f"Can't convert object of type {type(obj).__name__} to nanoarrow.Array" ) + + +def array_stream(obj): + if isinstance(obj, Schema): + return obj + + # Not particularly safe because _export_to_c() could be exporting an + # array, schema, or array_stream. 
The ideal + # solution here would be something like __arrow_c_array_stream__() + if hasattr(obj, "_export_to_c"): + out = Schema.empty() + obj._export_to_c(out._addr()) + return out + else: + raise TypeError( + f"Can't convert object of type {type(obj).__name__} to nanoarrow.Schema" + ) From ed5e5450b6760c53f0fe374acc1f7097ecf62a18 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 15:39:45 -0300 Subject: [PATCH 35/52] some array stream madness --- python/src/nanoarrow/__init__.py | 4 ++-- python/src/nanoarrow/_lib.pyx | 22 +++++++++++++++++++--- python/src/nanoarrow/lib.py | 4 ++-- python/tests/test_nanoarrow.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 4b41e0674..bb4372642 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -15,5 +15,5 @@ # specific language governing permissions and limitations # under the License. 
-from ._lib import c_version, Schema, Array, ArrayView -from .lib import schema, array +from ._lib import c_version, Schema, Array, ArrayView, ArrayStream +from .lib import schema, array, array_stream diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 69a9b4e59..7ef9d9321 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -588,12 +588,16 @@ cdef class ArrayViewBuffers: cdef class ArrayStream: cdef object _base cdef ArrowArrayStream* _ptr + cdef object _cached_schema def __init__(self, object base, uintptr_t addr): self._base = base, self._ptr = addr self._cached_schema = None + def _addr(self): + return self._ptr + def is_valid(self): return self._ptr != NULL and self._ptr.release != NULL @@ -622,8 +626,6 @@ cdef class ArrayStream: def get_schema(self): # Update the cached copy of the schema as an independent object - if self._cached_schema is not None: - del self._cached_schema self._cached_schema = Schema.empty() self._get_schema(self._cached_schema) @@ -653,4 +655,18 @@ cdef class ArrayStream: else: Error.raise_error("ArrowArrayStream::get_next()", code) - return array + if not array.is_valid(): + return None + else: + return array + + def __iter__(self): + array = self.get_next() + while array is not None: + yield array + array = self.get_next() + + @staticmethod + def empty(): + base = ArrayStreamHolder() + return ArrayStream(base, base._addr()) diff --git a/python/src/nanoarrow/lib.py b/python/src/nanoarrow/lib.py index 549e9d096..b0a2a326d 100644 --- a/python/src/nanoarrow/lib.py +++ b/python/src/nanoarrow/lib.py @@ -1,4 +1,4 @@ -from ._lib import Schema, Array +from ._lib import Schema, Array, ArrayStream def schema(obj): @@ -43,7 +43,7 @@ def array_stream(obj): # array, schema, or array_stream. 
The ideal # solution here would be something like __arrow_c_array_stream__() if hasattr(obj, "_export_to_c"): - out = Schema.empty() + out = ArrayStream.empty() obj._export_to_c(out._addr()) return out else: diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index eeb04a7e2..6f7477dfe 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -263,3 +263,32 @@ def test_buffers_binary(): np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32()) ) np.testing.assert_array_equal(np.array(view.buffers[2]), np.array(list(b"abcdef"))) + + +def test_array_stream(): + array_stream = na.ArrayStream.empty() + assert array_stream.is_valid() is False + with pytest.raises(RuntimeError): + array_stream.get_schema() + with pytest.raises(RuntimeError): + array_stream.get_next() + + pa_array_child = pa.array([1, 2, 3], pa.int32()) + pa_array = pa.record_batch([pa_array_child], names=["some_column"]) + reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array]) + array_stream = na.array_stream(reader) + + assert array_stream.is_valid() is True + array = array_stream.get_next() + assert array.schema.children[0].name == "some_column" + assert array_stream.get_next() is None + +def test_array_stream_iter(): + pa_array_child = pa.array([1, 2, 3], pa.int32()) + pa_array = pa.record_batch([pa_array_child], names=["some_column"]) + reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array]) + array_stream = na.array_stream(reader) + + arrays = list(array_stream) + assert len(arrays) == 1 + assert arrays[0].schema.children[0].name == "some_column" From 1321545c689de9cb8200d9a0cb98fc2e261b25d9 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 16:30:48 -0300 Subject: [PATCH 36/52] some tidying --- python/.coveragerc | 20 +++++ python/setup.py | 14 ++++ python/src/nanoarrow/_lib.pyx | 142 ++++++++++++++++++++++++++++++++-- python/src/nanoarrow/lib.py | 17 ++++ 4 files changed, 188 insertions(+), 
5 deletions(-) create mode 100644 python/.coveragerc diff --git a/python/.coveragerc b/python/.coveragerc new file mode 100644 index 000000000..1fb6a24ea --- /dev/null +++ b/python/.coveragerc @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# .coveragerc to control coverage.py +[run] +plugins = Cython.Coverage diff --git a/python/setup.py b/python/setup.py index 8b4b61c42..3ede82c86 100644 --- a/python/setup.py +++ b/python/setup.py @@ -30,6 +30,17 @@ if os.path.exists(bootstrap_py): subprocess.run([sys.executable, bootstrap_py]) + +# Set some extra flags for compiling with coverage support +if os.getenv('NANOARROW_PYTHON_COVERAGE') == "1": + coverage_compile_args = ['--coverage'] + coverage_link_args = ['--coverage'] + coverage_define_macros = [("CYTHON_TRACE", 1)] +else: + coverage_compile_args = [] + coverage_link_args = [] + coverage_define_macros = [] + setup( ext_modules=[ Extension( @@ -40,6 +51,9 @@ 'src/nanoarrow/_lib.pyx', 'src/nanoarrow/nanoarrow.c', ], + extra_compile_args = coverage_compile_args, + extra_link_args = [] + coverage_link_args, + define_macros= [] + coverage_define_macros, ) ] ) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 7ef9d9321..769a2ec61 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -16,8 +16,16 @@ # under the License. # cython: language_level = 3 +# cython: linetrace=True -"""Low-level nanoarrow Python bindings.""" +"""Low-level nanoarrow Python bindings + +This Cython extension provides low-level Python wrappers around the +Arrow C Data and Arrow C Stream interface structs. In general, there +is one wrapper per C struct and pointer validity is managed by keeping +strong references to Python objects. These wrappers are intended to +be literal and stay close to the structure definitions. 
+""" from libc.stdint cimport uintptr_t, int64_t from cpython.mem cimport PyMem_Malloc, PyMem_Free @@ -26,9 +34,17 @@ from cpython cimport Py_buffer from nanoarrow_c cimport * def c_version(): + """Return the nanoarrow C library version string + """ return ArrowNanoarrowVersion().decode("UTF-8") cdef class SchemaHolder: + """Memory holder for an ArrowSchema + + This class is responsible for the lifecycle of the ArrowSchema + whose memory it is responsible. When this object is deleted, + a non-NULL release callback is invoked. + """ cdef ArrowSchema c_schema def __cinit__(self): @@ -42,6 +58,12 @@ cdef class SchemaHolder: return &self.c_schema cdef class ArrayHolder: + """Memory holder for an ArrowArray + + This class is responsible for the lifecycle of the ArrowArray + whose memory it is responsible. When this object is deleted, + a non-NULL release callback is invoked. + """ cdef ArrowArray c_array def __cinit__(self): @@ -55,6 +77,12 @@ cdef class ArrayHolder: return &self.c_array cdef class ArrayStreamHolder: + """Memory holder for an ArrowArrayStream + + This class is responsible for the lifecycle of the ArrowArrayStream + whose memory it is responsible. When this object is deleted, + a non-NULL release callback is invoked. + """ cdef ArrowArrayStream c_array_stream def __cinit__(self): @@ -68,6 +96,12 @@ cdef class ArrayStreamHolder: return &self.c_array_stream cdef class ArrayViewHolder: + """Memory holder for an ArrowArrayView + + This class is responsible for the lifecycle of the ArrowArrayView + whose memory it is responsible. When this object is deleted, + ArrowArrayViewReset() is called on the contents. 
+ """ cdef ArrowArrayView c_array_view def __init__(self): @@ -81,6 +115,13 @@ cdef class ArrayViewHolder: class NanoarrowException(RuntimeError): + """An error resulting from a call to the nanoarrow C library + + Calls to the nanoarrow C library and/or the Arrow C Stream interface + callbacks return an errno error code and sometimes a message with extra + detail. This exception wraps a RuntimeError to format a suitable message + and store the components of the original error. + """ def __init__(self, what, code, message): self.what = what @@ -94,20 +135,56 @@ class NanoarrowException(RuntimeError): cdef class Error: + """Memory holder for an ArrowError + + ArrowError is the C struct that is optionally passed to nanoarrow functions + when a detailed error message might be returned. This class holds a C + reference to the object and provides helpers for raising exceptions based + on the contained message. + """ cdef ArrowError c_error def __cinit__(self): self.c_error.message[0] = 0 def raise_message(self, what, code): - raise Exception(what, code, self.c_error.message.decode("UTF-8")) + """Raise a NanoarrowException from this message + """ + raise NanoarrowException(what, code, self.c_error.message.decode("UTF-8")) @staticmethod def raise_error(what, code): - raise Exception(what, code, "") + """Raise a NanoarrowException without a message + """ + raise NanoarrowException(what, code, "") cdef class Schema: + """ArrowSchema wrapper + + This class provides a user-facing interface to access the fields of + an ArrowSchema as defined in the Arrow C Data interface. These objects + are usually created using `nanoarrow.schema()`. This Python wrapper + allows access to schema fields but does not automatically deserialize + their content: use `.view()` to validate and deserialize the content + into a more easily inspectable object. 
+ + Examples + -------- + + >>> import pyarrow as pa + >>> import nanoarrow as na + >>> schema = na.schema(pa.int32()) + >>> schema.is_valid() + True + >>> schema.format + 'i' + >>> schema.name + '' + >>> schema_view = schema.view() + >>> schema_view.type + 'int32' + """ cdef object _base cdef ArrowSchema* _ptr @@ -124,9 +201,11 @@ cdef class Schema: return self._ptr def is_valid(self): - return self._ptr.release != NULL + return self._ptr != NULL and self._ptr.release != NULL def _assert_valid(self): + if self._ptr == NULL: + raise RuntimeError("schema is NULL") if self._ptr.release == NULL: raise RuntimeError("schema is released") @@ -190,7 +269,30 @@ cdef class Schema: raise ValueError(ArrowErrorMessage(&error)) return schema_view + cdef class SchemaView: + """ArrowSchemaView wrapper + + The ArrowSchemaView is a nanoarrow C library structure that facilitates + access to the deserialized content of an ArrowSchema (e.g., parameter + values for parameterized types). This wrapper extends that facility to Python. + + Examples + -------- + + >>> import pyarrow as pa + >>> import nanoarrow as na + >>> schema = na.schema(pa.decimal128(10, 3)) + >>> schema_view = schema.view() + >>> schema_view.type + 'decimal128' + >>> schema_view.decimal_bitwidth + 128 + >>> schema_view.decimal_precision + 10 + >>> schema_view.decimal_scale + 3 + """ cdef ArrowSchemaView _schema_view _fixed_size_types = ( @@ -285,6 +387,34 @@ cdef class SchemaView: ) cdef class Array: + """ArrowArray wrapper + + This class provides a user-facing interface to access the fields of + an ArrowArray as defined in the Arrow C Data interface, holding an + optional reference to a Schema that can be used to safely deserialize + the content. These objects are usually created using `nanoarrow.array()`. + This Python wrapper allows access to array fields but does not + automatically deserialize their content: use `.view()` to validate and + deserialize the content into a more easily inspectable object. 
+ + Examples + -------- + + >>> import pyarrow as pa + >>> import numpy as np + >>> import nanoarrow as na + >>> array = na.array(pa.array(["one", "two", "three", None])) + >>> array.length + 4 + >>> array.null_count + 1 + >>> array_view = array.view() + >>> np.array(array_view.buffers[1]) + array([ 0, 3, 6, 11, 11], dtype=int32) + >>> np.array(array_view.buffers[2]) + array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'], + dtype='|S1') + """ cdef object _base cdef ArrowArray* _ptr cdef Schema _schema @@ -303,9 +433,11 @@ cdef class Array: return self._ptr def is_valid(self): - return self._ptr.release != NULL + return self._ptr != NULL and self._ptr.release != NULL def _assert_valid(self): + if self._ptr == NULL: + raise RuntimeError("Array is NULL") if self._ptr.release == NULL: raise RuntimeError("Array is released") diff --git a/python/src/nanoarrow/lib.py b/python/src/nanoarrow/lib.py index b0a2a326d..ef8bca692 100644 --- a/python/src/nanoarrow/lib.py +++ b/python/src/nanoarrow/lib.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from ._lib import Schema, Array, ArrayStream From c1c2df24ed7a5da73b48c591e9a54993589abed1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 16:48:40 -0300 Subject: [PATCH 37/52] more documentation --- python/src/nanoarrow/_lib.pyx | 73 ++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 769a2ec61..53f1dbdfc 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -409,11 +409,6 @@ cdef class Array: >>> array.null_count 1 >>> array_view = array.view() - >>> np.array(array_view.buffers[1]) - array([ 0, 3, 6, 11, 11], dtype=int32) - >>> np.array(array_view.buffers[2]) - array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'], - dtype='|S1') """ cdef object _base cdef ArrowArray* _ptr @@ -492,6 +487,25 @@ cdef class Array: cdef class ArrayView: + """ArrowArrayView wrapper + + The ArrowArrayView is a nanoarrow C library structure that facilitates + access to the deserialized content of an ArrowArray (e.g., buffer types, + lengths, and content). This wrapper extends that facility to Python. 
+ + Examples + -------- + + >>> import pyarrow as pa + >>> import numpy as np + >>> import nanoarrow as na + >>> array_view = na.array(pa.array(["one", "two", "three", None])).view() + >>> np.array(array_view.buffers[1]) + array([ 0, 3, 6, 11, 11], dtype=int32) + >>> np.array(array_view.buffers[2]) + array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'], + dtype='|S1') + """ cdef object _base cdef ArrowArrayView* _ptr cdef Array _array @@ -522,6 +536,8 @@ cdef class ArrayView: return self._array._schema cdef class SchemaChildren: + """Wrapper for a lazily-resolved list of Schema children + """ cdef Schema _parent cdef int64_t _length @@ -544,7 +560,11 @@ cdef class SchemaChildren: cdef ArrowSchema* child = children[i] return child + cdef class SchemaMetadata: + """Wrapper for a lazily-parsed Schema.metadata string + """ + cdef object _parent cdef const char* _metadata cdef ArrowMetadataReader _reader @@ -572,7 +592,10 @@ cdef class SchemaMetadata: value_obj = PyBytes_FromStringAndSize(value.data, value.size_bytes) yield key_obj, value_obj + cdef class ArrayChildren: + """Wrapper for a lazily-resolved list of Array children + """ cdef Array _parent cdef int64_t _length @@ -594,7 +617,10 @@ cdef class ArrayChildren: cdef ArrowArray* child = children[i] return child + cdef class ArrayViewChildren: + """Wrapper for a lazily-resolved list of ArrayView children + """ cdef ArrayView _parent cdef int64_t _length @@ -617,6 +643,13 @@ cdef class ArrayViewChildren: return child cdef class BufferView: + """Wrapper for Array buffer content + + This object is a Python wrapper around a buffer held by an Array. + It implements the Python buffer protocol and is best accessed through + another implementor (e.g., `np.array(array_view.buffers[1])`). Note that + this buffer content does not apply any parent offset. 
+ """ cdef object _base cdef ArrowBufferView* _ptr cdef ArrowBufferType _buffer_type @@ -689,7 +722,10 @@ cdef class BufferView: def __releasebuffer__(self, Py_buffer *buffer): pass + cdef class ArrayViewBuffers: + """A lazily-resolved list of ArrayView buffers + """ cdef ArrayView _array_view cdef int64_t _length @@ -718,6 +754,27 @@ cdef class ArrayViewBuffers: cdef class ArrayStream: + """ArrowArrayStream wrapper + + This class provides a user-facing interface to access the fields of + an ArrowArrayStream as defined in the Arrow C Stream interface. + These objects are usually created using `nanoarrow.array_stream()`. + + Examples + -------- + + >>> import pyarrow as pa + >>> import nanoarrow as na + >>> pa_column = pa.array([1, 2, 3], pa.int32()) + >>> pa_batch = pa.record_batch([pa_column], names=["col1"]) + >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema, [pa_batch]) + >>> array_stream = na.array_stream(pa_reader) + >>> array_stream.get_schema() + struct + >>> array_stream.get_next().length + >>> array_stream.get_next() is None + True + """ cdef object _base cdef ArrowArrayStream* _ptr cdef object _cached_schema @@ -757,6 +814,8 @@ cdef class ArrayStream: self._cached_schema = schema def get_schema(self): + """Get the schema associated with this stream + """ # Update the cached copy of the schema as an independent object self._cached_schema = Schema.empty() self._get_schema(self._cached_schema) @@ -767,6 +826,10 @@ cdef class ArrayStream: return out def get_next(self): + """Get the next Array from this stream + + Returns None when there are no more arrays in this stream. 
+ """ self._assert_valid() if self._cached_schema is None: From ee4dbb6a9b3e136082ff3e219429bc622530ef2a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 16:50:43 -0300 Subject: [PATCH 38/52] empty -> allocate --- python/src/nanoarrow/_lib.pyx | 10 +++++----- python/src/nanoarrow/lib.py | 8 ++++---- python/tests/test_nanoarrow.py | 11 ++++++----- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 53f1dbdfc..4f3c1b270 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -189,7 +189,7 @@ cdef class Schema: cdef ArrowSchema* _ptr @staticmethod - def empty(): + def allocate(): base = SchemaHolder() return Schema(base, base._addr()) @@ -817,11 +817,11 @@ cdef class ArrayStream: """Get the schema associated with this stream """ # Update the cached copy of the schema as an independent object - self._cached_schema = Schema.empty() + self._cached_schema = Schema.allocate() self._get_schema(self._cached_schema) # Return an independent copy - out = Schema.empty() + out = Schema.allocate() self._get_schema(out) return out @@ -833,7 +833,7 @@ cdef class ArrayStream: self._assert_valid() if self._cached_schema is None: - self._cached_schema = Schema.empty() + self._cached_schema = Schema.allocate() self._get_schema(self._cached_schema) cdef Array array = Array.empty(self._cached_schema) @@ -862,6 +862,6 @@ cdef class ArrayStream: array = self.get_next() @staticmethod - def empty(): + def allocate(): base = ArrayStreamHolder() return ArrayStream(base, base._addr()) diff --git a/python/src/nanoarrow/lib.py b/python/src/nanoarrow/lib.py index ef8bca692..8841ade47 100644 --- a/python/src/nanoarrow/lib.py +++ b/python/src/nanoarrow/lib.py @@ -26,7 +26,7 @@ def schema(obj): # array, schema, or array_stream. 
The ideal # solution here would be something like __arrow_c_schema__() if hasattr(obj, "_export_to_c"): - out = Schema.empty() + out = Schema.allocate() obj._export_to_c(out._addr()) return out else: @@ -43,7 +43,7 @@ def array(obj): # not fail with a crash (but will fail with a confusing error). The ideal # solution here would be something like __arrow_c_array__() if hasattr(obj, "_export_to_c"): - out = Array.empty(Schema.empty()) + out = Array.empty(Schema.allocate()) obj._export_to_c(out._addr(), out.schema._addr()) return out else: @@ -60,10 +60,10 @@ def array_stream(obj): # array, schema, or array_stream. The ideal # solution here would be something like __arrow_c_array_stream__() if hasattr(obj, "_export_to_c"): - out = ArrayStream.empty() + out = ArrayStream.allocate() obj._export_to_c(out._addr()) return out else: raise TypeError( - f"Can't convert object of type {type(obj).__name__} to nanoarrow.Schema" + f"Can't convert object of type {type(obj).__name__} to nanoarrow.ArrowArrayStream" ) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 6f7477dfe..9e3170caf 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -30,7 +30,7 @@ def test_c_version(): def test_schema_helper(): - schema = na.Schema.empty() + schema = na.Schema.allocate() assert na.schema(schema) is schema schema = na.schema(pa.null()) @@ -41,7 +41,7 @@ def test_schema_helper(): def test_array_helper(): - array = na.Array.empty(na.Schema.empty()) + array = na.Array.empty(na.Schema.allocate()) assert na.array(array) is array array = na.array(pa.array([], pa.null())) @@ -52,7 +52,7 @@ def test_array_helper(): def test_schema_basic(): - schema = na.Schema.empty() + schema = na.Schema.allocate() assert schema.is_valid() is False assert repr(schema) == "[invalid: schema is released]" @@ -89,7 +89,7 @@ def test_schema_metadata(): def test_schema_view(): - schema = na.Schema.empty() + schema = na.Schema.allocate() with 
pytest.raises(RuntimeError): schema.view() @@ -266,7 +266,7 @@ def test_buffers_binary(): def test_array_stream(): - array_stream = na.ArrayStream.empty() + array_stream = na.ArrayStream.allocate() assert array_stream.is_valid() is False with pytest.raises(RuntimeError): array_stream.get_schema() @@ -283,6 +283,7 @@ def test_array_stream(): assert array.schema.children[0].name == "some_column" assert array_stream.get_next() is None + def test_array_stream_iter(): pa_array_child = pa.array([1, 2, 3], pa.int32()) pa_array = pa.record_batch([pa_array_child], names=["some_column"]) From 4bb8f860a316b199d17ed93c201e09acc95f3dce Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 16:57:15 -0300 Subject: [PATCH 39/52] in theory use nanoarrowexcpetion --- python/src/nanoarrow/_lib.pyx | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 4f3c1b270..c1d45821e 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -263,10 +263,11 @@ cdef class Schema: def view(self): self._assert_valid() schema_view = SchemaView() - cdef ArrowError error - cdef int result = ArrowSchemaViewInit(&schema_view._schema_view, self._ptr, &error) + cdef Error error = Error() + cdef int result = ArrowSchemaViewInit(&schema_view._schema_view, self._ptr, &error.c_error) if result != NANOARROW_OK: - raise ValueError(ArrowErrorMessage(&error)) + error.raise_message("ArrowSchemaViewInit()", result) + return schema_view @@ -473,15 +474,15 @@ cdef class Array: def view(self): cdef ArrayViewHolder holder = ArrayViewHolder() - cdef ArrowError error + cdef Error error = Error() cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view, - self._schema._ptr, &error) + self._schema._ptr, &error.c_error) if result != NANOARROW_OK: - raise ValueError(ArrowErrorMessage(&error)) + error.raise_message("ArrowArrayViewInitFromSchema()", result) - result = 
ArrowArrayViewSetArray(&holder.c_array_view, self._ptr, &error) + result = ArrowArrayViewSetArray(&holder.c_array_view, self._ptr, &error.c_error) if result != NANOARROW_OK: - raise ValueError(ArrowErrorMessage(&error)) + error.raise_message("ArrowArrayViewSetArray()", result) return ArrayView(holder, holder._addr(), self) @@ -576,7 +577,7 @@ cdef class SchemaMetadata: def _init_reader(self): cdef int result = ArrowMetadataReaderInit(&self._reader, self._metadata) if result != NANOARROW_OK: - raise ValueError('ArrowMetadataReaderInit() failed') + Error.raise_error("ArrowMetadataReaderInit()", result) def __len__(self): self._init_reader() From 1d3d8c1af8a0a1915ce761432caa59abc4a4322a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 17:06:13 -0300 Subject: [PATCH 40/52] attempt starting coverage --- python/.gitignore | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/.gitignore b/python/.gitignore index 8abd5d0de..b3724522b 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -16,10 +16,10 @@ # specific language governing permissions and limitations # under the License. 
-src/nanoarrow/nanoarrow.c -src/nanoarrow/nanoarrow.h -src/nanoarrow/nanoarrow_c.pxd -src/nanoarrow/*.c +nanoarrow/nanoarrow.c +nanoarrow/nanoarrow.h +nanoarrow/nanoarrow_c.pxd +nanoarrow/*.c # Byte-compiled / optimized / DLL files __pycache__/ From 3d03cc61de98833da9cda6f8a61f2439d3fd4ac3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 17:06:20 -0300 Subject: [PATCH 41/52] attempt starting coverage again --- python/.coverage | Bin 0 -> 53248 bytes python/MANIFEST.in | 6 +++--- python/bootstrap.py | 14 +++++++------- python/{src => }/nanoarrow/__init__.py | 0 python/{src => }/nanoarrow/_lib.pyx | 0 python/{src => }/nanoarrow/lib.py | 0 python/setup.py | 24 ++++++++++++------------ 7 files changed, 22 insertions(+), 22 deletions(-) create mode 100644 python/.coverage rename python/{src => }/nanoarrow/__init__.py (100%) rename python/{src => }/nanoarrow/_lib.pyx (100%) rename python/{src => }/nanoarrow/lib.py (100%) diff --git a/python/.coverage b/python/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..cb940059e3ace43b858468e85ebfdb74569e3700 GIT binary patch literal 53248 zcmeI)O^@3|7zc3MNs}g<;6hbJt5w;#z;?ASVL=rF4zSQYR8%V4E~pX`?!=jFOcOh` zo$l_TLTQmIA@L0mNBA6ExbqPzA+r?}UaUkb2)i;C?-?fy(=vnTXuj;&8yEg9<Ju{J+%-ntpfMn}b=bXBPD ztDjxk+T7U^JDZdp%VbHC|f#`UC5~d${u^0G4?JKt*tLC8rt}_8S5w`ADS{Jb@uz+29bZ;@AmlQP!4_IIGC1mTHZKFgMIFEbI7?I z#XE62GjcwoI#=KKuJ_gOUntfV7R+0vq#31WD}5+V zG@?bIoA zpnC+5OME*v5d?1;b%Dnwg6FS(PsVNWWZii)Reb4Kq4v?KX%*)xI2(Llxc#-2RQb+) zqTi)KpKrN|D)W|6l|NSpYd2(vMw6cOLh7aKfgf#Ih0eXc0`Jk0MIAVPs6%Y8vD6(@^SPaLK(#pb;^jk z=woGOg{FIX#LCKF?~w_Y+LcY+-T9=9mAKbdPZXzTqjNG=vDa0ClO&Z%8rGhSgc~Z_ zNptD)QtoNZfX|k*!S{TXe4Jm(4HnH}ZDqwA^wZ9$#~CN0NxdtxYNt=FiAM6+&QW{0 zNCPgt+8J*rY&XWiPxTRaaMNe7p%t=J>1&|Z?I zvCZ5q>rbw)DaZ9lQR=@Ba-?99AFz0SG_<0uX=z1Rwwb2tWV=vnY@= zb4Hav|2OUb4En|NsBP*^d?x hLI45~fB*y_009U<00Izz00fSbK+d9{{!f1Y|9^f9w@?58 literal 0 HcmV?d00001 diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 9fc293725..93ed2fd0a 
100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -16,6 +16,6 @@ # under the License. exclude bootstrap.py -include src/nanoarrow/nanoarrow.c -include src/nanoarrow/nanoarrow.h -include src/nanoarrow/nanoarrow_c.pxd +include nanoarrow/nanoarrow.c +include nanoarrow/nanoarrow.h +include nanoarrow/nanoarrow_c.pxd diff --git a/python/bootstrap.py b/python/bootstrap.py index 8cdc83dcd..9a41446c9 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -145,8 +145,8 @@ def copy_or_generate_nanoarrow_c(): this_dir = os.path.abspath(os.path.dirname(__file__)) source_dir = os.path.dirname(this_dir) - maybe_nanoarrow_h = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.h') - maybe_nanoarrow_c = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.c') + maybe_nanoarrow_h = os.path.join(this_dir, 'nanoarrow/nanoarrow.h') + maybe_nanoarrow_c = os.path.join(this_dir, 'nanoarrow/nanoarrow.c') for f in (maybe_nanoarrow_c, maybe_nanoarrow_h): if os.path.exists(f): os.unlink(f) @@ -161,7 +161,7 @@ def copy_or_generate_nanoarrow_c(): os.mkdir(build_dir) os.chdir(build_dir) os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON') - os.system(f'cmake --install . --prefix=../src/nanoarrow') + os.system(f'cmake --install . 
--prefix=../nanoarrow') finally: if os.path.exists(build_dir): # Can fail on Windows with permission issues @@ -176,18 +176,18 @@ def copy_or_generate_nanoarrow_c(): else: raise ValueError('Attempt to build source distribution outside the nanoarrow repo') - if not os.path.exists(os.path.join(this_dir, 'src/nanoarrow/nanoarrow.h')): + if not os.path.exists(os.path.join(this_dir, 'nanoarrow/nanoarrow.h')): raise ValueError('Attempt to vendor nanoarrow.c/h failed') - maybe_nanoarrow_hpp = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.hpp') + maybe_nanoarrow_hpp = os.path.join(this_dir, 'nanoarrow/nanoarrow.hpp') if os.path.exists(maybe_nanoarrow_hpp): os.unlink(maybe_nanoarrow_hpp) # Runs the pxd generator with some information about the file name def generate_nanoarrow_pxd(): this_dir = os.path.abspath(os.path.dirname(__file__)) - maybe_nanoarrow_h = os.path.join(this_dir, 'src/nanoarrow/nanoarrow.h') - maybe_nanoarrow_pxd = os.path.join(this_dir, 'src/nanoarrow/nanoarrow_c.pxd') + maybe_nanoarrow_h = os.path.join(this_dir, 'nanoarrow/nanoarrow.h') + maybe_nanoarrow_pxd = os.path.join(this_dir, 'nanoarrow/nanoarrow_c.pxd') NanoarrowPxdGenerator().generate_nanoarrow_pxd( maybe_nanoarrow_h, diff --git a/python/src/nanoarrow/__init__.py b/python/nanoarrow/__init__.py similarity index 100% rename from python/src/nanoarrow/__init__.py rename to python/nanoarrow/__init__.py diff --git a/python/src/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx similarity index 100% rename from python/src/nanoarrow/_lib.pyx rename to python/nanoarrow/_lib.pyx diff --git a/python/src/nanoarrow/lib.py b/python/nanoarrow/lib.py similarity index 100% rename from python/src/nanoarrow/lib.py rename to python/nanoarrow/lib.py diff --git a/python/setup.py b/python/setup.py index 3ede82c86..4222cd85d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -26,15 +26,15 @@ # checkout or copy from ../dist if the caller doesn't have cmake available. 
# Note that bootstrap.py won't exist if building from sdist. this_dir = os.path.dirname(__file__) -bootstrap_py = os.path.join(this_dir, 'bootstrap.py') +bootstrap_py = os.path.join(this_dir, "bootstrap.py") if os.path.exists(bootstrap_py): subprocess.run([sys.executable, bootstrap_py]) # Set some extra flags for compiling with coverage support -if os.getenv('NANOARROW_PYTHON_COVERAGE') == "1": - coverage_compile_args = ['--coverage'] - coverage_link_args = ['--coverage'] +if os.getenv("NANOARROW_PYTHON_COVERAGE") == "1": + coverage_compile_args = ["--coverage"] + coverage_link_args = ["--coverage"] coverage_define_macros = [("CYTHON_TRACE", 1)] else: coverage_compile_args = [] @@ -44,16 +44,16 @@ setup( ext_modules=[ Extension( - name='nanoarrow._lib', - include_dirs=['src/nanoarrow'], - language='c', + name="nanoarrow._lib", + include_dirs=["nanoarrow"], + language="c", sources=[ - 'src/nanoarrow/_lib.pyx', - 'src/nanoarrow/nanoarrow.c', + "nanoarrow/_lib.pyx", + "nanoarrow/nanoarrow.c", ], - extra_compile_args = coverage_compile_args, - extra_link_args = [] + coverage_link_args, - define_macros= [] + coverage_define_macros, + extra_compile_args=coverage_compile_args, + extra_link_args=coverage_link_args, + define_macros=coverage_define_macros, ) ] ) From 3413af1eab825e91033e6216ff1d6789243931f3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 20:37:56 -0300 Subject: [PATCH 42/52] remove coverage file --- python/.coverage | Bin 53248 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/.coverage diff --git a/python/.coverage b/python/.coverage deleted file mode 100644 index cb940059e3ace43b858468e85ebfdb74569e3700..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 53248 zcmeI)O^@3|7zc3MNs}g<;6hbJt5w;#z;?ASVL=rF4zSQYR8%V4E~pX`?!=jFOcOh` zo$l_TLTQmIA@L0mNBA6ExbqPzA+r?}UaUkb2)i;C?-?fy(=vnTXuj;&8yEg9<Ju{J+%-ntpfMn}b=bXBPD 
ztDjxk+T7U^JDZdp%VbHC|f#`UC5~d${u^0G4?JKt*tLC8rt}_8S5w`ADS{Jb@uz+29bZ;@AmlQP!4_IIGC1mTHZKFgMIFEbI7?I z#XE62GjcwoI#=KKuJ_gOUntfV7R+0vq#31WD}5+V zG@?bIoA zpnC+5OME*v5d?1;b%Dnwg6FS(PsVNWWZii)Reb4Kq4v?KX%*)xI2(Llxc#-2RQb+) zqTi)KpKrN|D)W|6l|NSpYd2(vMw6cOLh7aKfgf#Ih0eXc0`Jk0MIAVPs6%Y8vD6(@^SPaLK(#pb;^jk z=woGOg{FIX#LCKF?~w_Y+LcY+-T9=9mAKbdPZXzTqjNG=vDa0ClO&Z%8rGhSgc~Z_ zNptD)QtoNZfX|k*!S{TXe4Jm(4HnH}ZDqwA^wZ9$#~CN0NxdtxYNt=FiAM6+&QW{0 zNCPgt+8J*rY&XWiPxTRaaMNe7p%t=J>1&|Z?I zvCZ5q>rbw)DaZ9lQR=@Ba-?99AFz0SG_<0uX=z1Rwwb2tWV=vnY@= zb4Hav|2OUb4En|NsBP*^d?x hLI45~fB*y_009U<00Izz00fSbK+d9{{!f1Y|9^f9w@?58 From 4f8cef20c9dd5f7fce5cd20742c1f068d0a18c86 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 20:48:31 -0300 Subject: [PATCH 43/52] add coverage to python job --- .github/workflows/python.yaml | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 7d9357809..82b0eede9 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -40,7 +40,7 @@ jobs: steps: - uses: actions/checkout@v3 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -57,3 +57,30 @@ jobs: - name: Run tests run: | pytest python/tests -v -s + + - name: Run doctests + if: success() && matrix.python-version == '3.10' + run: | + # Needs editable install to run --doctest-modules + pip install -e python + pytest python --doctest-modules + + - name: Coverage + if: success() && matrix.python-version == '3.10' + run: | + pip uninstall --yes nanoarrow + pip install pytest-cov Cython + pushd python + + # Build with Cython + gcc coverage options + NANOARROW_PYTHON_COVERAGE=1 python setup.py build_ext --inplace + + # Run tests + coverage.py (generates .coverage + coverage.xml files) + python -m pytest --cov ./nanoarrow + python -m coverage xml + + - name: Upload coverage to codecov + if: success() && matrix.python-version == '3.10' 
+ uses: codecov/codecov-action@v2 + with: + files: 'python/coverage.xml' From f6be55c731365b8f7f6b8e14e173bb4b041fa399 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 20:56:48 -0300 Subject: [PATCH 44/52] fix + test doctests from Cython --- .github/workflows/python.yaml | 5 +++-- python/nanoarrow/_lib.pyx | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 82b0eede9..4b599f7ee 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -61,9 +61,10 @@ jobs: - name: Run doctests if: success() && matrix.python-version == '3.10' run: | - # Needs editable install to run --doctest-modules + # Needs editable install to run --doctest-cython + pip install pytest-cython pip install -e python - pytest python --doctest-modules + pytest python --doctest-cython - name: Coverage if: success() && matrix.python-version == '3.10' diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index c1d45821e..7317a4b5d 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -773,6 +773,7 @@ cdef class ArrayStream: >>> array_stream.get_schema() struct >>> array_stream.get_next().length + 3 >>> array_stream.get_next() is None True """ From ce3463fee03d1ba46d5fd8e4f1dc5180b31a8630 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 12 Jun 2023 22:10:06 -0300 Subject: [PATCH 45/52] basic readme --- python/README.ipynb | 392 +++++++++++++++++++++++++++++++++ python/README.md | 176 ++++++++++++++- python/nanoarrow/_lib.pyx | 4 +- python/nanoarrow/lib.py | 2 +- python/tests/test_nanoarrow.py | 2 +- 5 files changed, 562 insertions(+), 14 deletions(-) create mode 100644 python/README.ipynb diff --git a/python/README.ipynb b/python/README.ipynb new file mode 100644 index 000000000..d89d4c4a6 --- /dev/null +++ b/python/README.ipynb @@ -0,0 +1,392 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "\n", + "\n", + "\n", + "\n", + "# nanoarrow for Python\n", + "\n", + "The nanoarrow Python package provides bindings to the nanoarrow C library. Like\n", + "the nanoarrow C library, it provides tools to facilitate the use of the\n", + "[Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html) \n", + "and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html) \n", + "interfaces.\n", + "\n", + "## Installation\n", + "\n", + "Python bindings for nanoarrow are not yet available on PyPI. You can install via\n", + "URL (requires a C compiler):\n", + "\n", + "```bash\n", + "python -m pip install \"https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python\"\n", + "```\n", + "\n", + "If you can import the namespace, you're good to go!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import nanoarrow as na" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example\n", + "\n", + "The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow Python package.\n", + "\n", + "### Schemas\n", + "\n", + "Use `nanoarrow.schema()` to convert a data type-like object to an `ArrowSchema`. This is currently only implemented for pyarrow objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "schema = na.schema(pa.decimal128(10, 3))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can extract the fields of a `Schema` object one at a time or parse it into a view to extract deserialized parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d:10,3\n", + "10\n", + "3\n" + ] + } + ], + "source": [ + "print(schema.format)\n", + "print(schema.view().decimal_precision)\n", + "print(schema.view().decimal_scale)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `nanoarrow.schema()` helper is currently only implemented for pyarrow objects. If your data type has an `_export_to_c()`-like function, you can get the address of a freshly-allocated `ArrowSchema` as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'int32'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema = na.Schema.allocate()\n", + "pa.int32()._export_to_c(schema._addr())\n", + "schema.view().type" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Schema` object cleans up after itself: when the object is deleted, the underlying `ArrowSchema` is released." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Arrays\n", + "\n", + "You can use `nanoarrow.array()` to convert an array-like object to a `nanoarrow.Array`, optionally attaching a `Schema` that can be used to interpret its contents. This is currently only implemented for pyarrow objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "array = na.array(pa.array([\"one\", \"two\", \"three\", None]))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like the `Schema`, you can inspect an `Array` by extracting fields individually:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4\n", + "1\n" + ] + } + ], + "source": [ + "print(array.length)\n", + "print(array.null_count)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "...and parse the `Array`/`Schema` combination into a view whose contents are more readily accessible." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[array([7], dtype=uint8),\n", + " array([ 0, 3, 6, 11, 11], dtype=int32),\n", + " array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],\n", + " dtype='|S1')]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "view = array.view()\n", + "[np.array(buffer) for buffer in view.buffers]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like the `Schema`, you can allocate an empty one and access its address with `_addr()` to pass to other array-exporting functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "array = na.Array.allocate(na.Schema.allocate())\n", + "pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())\n", + "array.length" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Array streams\n", + "\n", + "You can use `nanoarrow.array_stream()` to convert an object representing a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This is currently only implemented for pyarrow objects." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "pa_array_child = pa.array([1, 2, 3], pa.int32())\n", + "pa_array = pa.record_batch([pa_array_child], names=[\"some_column\"])\n", + "reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])\n", + "array_stream = na.array_stream(reader)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can pull the next array from the stream using `.get_next()` or use it like an iterator. The `.get_next()` method will return `None` when there are no more arrays in the stream."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "struct\n", + "3\n", + "True\n" + ] + } + ], + "source": [ + "print(array_stream.get_schema())\n", + "\n", + "for array in array_stream:\n", + " print(array.length)\n", + "\n", + "print(array_stream.get_next() is None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get the address of a freshly-allocated stream to pass to a suitable exporting function:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "struct" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "array_stream = na.ArrayStream.allocate()\n", + "reader._export_to_c(array_stream._addr())\n", + "array_stream.get_schema()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Development\n", + "\n", + "Python bindings for nanoarrow are managed with [setuptools](https://setuptools.pypa.io/en/latest/index.html).\n", + "This means you can build the project using:\n", + "\n", + "```shell\n", + "git clone https://github.com/apache/arrow-nanoarrow.git\n", + "cd arrow-nanoarrow/python\n", + "pip install -e .\n", + "```\n", + "\n", + "Tests use [pytest](https://docs.pytest.org/):\n", + "\n", + "```shell\n", + "# Install dependencies\n", + "pip install -e .[test]\n", + "\n", + "# Run tests\n", + "pytest -vvx\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4 
+ }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/README.md b/python/README.md index 04d05898b..db898d24a 100644 --- a/python/README.md +++ b/python/README.md @@ -17,10 +17,15 @@ under the License. --> + + # nanoarrow for Python -Python bindings for nanoarrow. These are in a preliminary state: see open issues -and tests/test_nanoarrow.py for usage. +The nanoarrow Python package provides bindings to the nanoarrow C library. Like +the nanoarrow C library, it provides tools to facilitate the use of the +[Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html) +and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html) +interfaces. ## Installation @@ -31,18 +36,172 @@ URL (requires a C compiler): python -m pip install "https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python" ``` -## Building +If you can import the namespace, you're good to go! + + +```python +import nanoarrow as na +``` + +## Example + +The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow Python package. + +### Schemas + +Use `nanoarrow.schema()` to convert a data type-like object to an `ArrowSchema`. This is currently only implemented for pyarrow objects. + + +```python +import pyarrow as pa +schema = na.schema(pa.decimal128(10, 3)) +``` + +You can extract the fields of a `Schema` object one at a time or parse it into a view to extract deserialized parameters. + + +```python +print(schema.format) +print(schema.view().decimal_precision) +print(schema.view().decimal_scale) +``` + + d:10,3 + 10 + 3 + + +The `nanoarrow.schema()` helper is currently only implemented for pyarrow objects. 
If your data type has an `_export_to_c()`-like function, you can get the address of a freshly-allocated `ArrowSchema` as well: + + +```python +schema = na.Schema.allocate() +pa.int32()._export_to_c(schema._addr()) +schema.view().type +``` + + + + + 'int32' + + + +The `Schema` object cleans up after itself: when the object is deleted, the underlying `Schema` is released. + +### Arrays + +You can use `nanoarrow.array()` to convert an array-like object to a `nanoarrow.Array`, optionally attaching a `Schema` that can be used to interpret its contents. This is currently only implemented for pyarrow objects. + + +```python +array = na.array(pa.array(["one", "two", "three", None])) +``` + +Like the `Schema`, you can inspect an `Array` by extracting fields individually: + + +```python +print(array.length) +print(array.null_count) +``` + + 4 + 1 + + +...and parse the `Array`/`Schema` combination into a view whose contents are more readily accessible. + + +```python +import numpy as np +view = array.view() +[np.array(buffer) for buffer in view.buffers] +``` + + + + + [array([7], dtype=uint8), + array([ 0, 3, 6, 11, 11], dtype=int32), + array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'], + dtype='|S1')] + + + +Like the `Schema`, you can allocate an empty one and access its address with `_addr()` to pass to other array-exporting functions. + + +```python +array = na.Array.allocate(na.Schema.allocate()) +pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr()) +array.length +``` + + + + + 3 + + + +### Array streams + +You can use `nanoarrow.array_stream()` to convert an object representing a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This is currently only implemented for pyarrow objects.
+ + +```python +pa_array_child = pa.array([1, 2, 3], pa.int32()) +pa_array = pa.record_batch([pa_array_child], names=["some_column"]) +reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array]) +array_stream = na.array_stream(reader) +``` + +You can pull the next array from the stream using `.get_next()` or use it like an iterator. The `.get_next()` method will return `None` when there are no more arrays in the stream. + -Python bindings for nanoarrow are managed with setuptools[setuptools]. This means you -can build the project using: +```python +print(array_stream.get_schema()) + +for array in array_stream: + print(array.length) + +print(array_stream.get_next() is None) +``` + + struct + 3 + True + + +You can also get the address of a freshly-allocated stream to pass to a suitable exporting function: + + +```python +array_stream = na.ArrayStream.allocate() +reader._export_to_c(array_stream._addr()) +array_stream.get_schema() +``` + + + + + struct + + + +## Development + +Python bindings for nanoarrow are managed with [setuptools](https://setuptools.pypa.io/en/latest/index.html). +This means you can build the project using: ```shell git clone https://github.com/apache/arrow-nanoarrow.git -cd python +cd arrow-nanoarrow/python pip install -e .
``` -Tests use [pytest][pytest]: +Tests use [pytest](https://docs.pytest.org/): ```shell # Install dependencies @@ -51,6 +210,3 @@ pip install -e .[test] # Run tests pytest -vvx ``` - -[pytest]: https://docs.pytest.org/ -[setuptools]: https://setuptools.pypa.io/en/latest/index.html diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index 7317a4b5d..0564ffa7b 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -416,7 +416,7 @@ cdef class Array: cdef Schema _schema @staticmethod - def empty(Schema schema): + def allocate(Schema schema): base = ArrayHolder() return Array(base, base._addr(), schema) @@ -838,7 +838,7 @@ cdef class ArrayStream: self._cached_schema = Schema.allocate() self._get_schema(self._cached_schema) - cdef Array array = Array.empty(self._cached_schema) + cdef Array array = Array.allocate(self._cached_schema) cdef int code = self._ptr.get_next(self._ptr, array._ptr) cdef const char* message = NULL if code != NANOARROW_OK: diff --git a/python/nanoarrow/lib.py b/python/nanoarrow/lib.py index 8841ade47..a3c27e72f 100644 --- a/python/nanoarrow/lib.py +++ b/python/nanoarrow/lib.py @@ -43,7 +43,7 @@ def array(obj): # not fail with a crash (but will fail with a confusing error). 
The ideal # solution here would be something like __arrow_c_array__() if hasattr(obj, "_export_to_c"): - out = Array.empty(Schema.allocate()) + out = Array.allocate(Schema.allocate()) obj._export_to_c(out._addr(), out.schema._addr()) return out else: diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 9e3170caf..3f5bea1a4 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -41,7 +41,7 @@ def test_schema_helper(): def test_array_helper(): - array = na.Array.empty(na.Schema.allocate()) + array = na.Array.allocate(na.Schema.allocate()) assert na.array(array) is array array = na.array(pa.array([], pa.null())) From 78cd7973f28035a0b3fcb12884207b0ecb0d369f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 11:12:58 -0300 Subject: [PATCH 46/52] Update python/nanoarrow/_lib.pyx Co-authored-by: Joris Van den Bossche --- python/nanoarrow/_lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index 0564ffa7b..d3e43e4cf 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -42,7 +42,7 @@ cdef class SchemaHolder: """Memory holder for an ArrowSchema This class is responsible for the lifecycle of the ArrowSchema - whose memory it is responsible. When this object is deleted, + whose memory it is responsible for. When this object is deleted, a non-NULL release callback is invoked. 
""" cdef ArrowSchema c_schema From dc7d91cb2048c064cbbffd6f435307adc1b5d9e9 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 16:21:21 -0300 Subject: [PATCH 47/52] use the namespace --- python/bootstrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/bootstrap.py b/python/bootstrap.py index 9a41446c9..39b4fd950 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -160,7 +160,7 @@ def copy_or_generate_nanoarrow_c(): try: os.mkdir(build_dir) os.chdir(build_dir) - os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON') + os.system(f'cmake ../.. -DNANOARROW_BUNDLE=ON -DNANOARROW_NAMESPACE=PythonPkg') os.system(f'cmake --install . --prefix=../nanoarrow') finally: if os.path.exists(build_dir): From a1150dd75329066d9e0fec21e8d7461b51937054 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 16:24:20 -0300 Subject: [PATCH 48/52] use cinit instead of init --- python/nanoarrow/_lib.pyx | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index d3e43e4cf..f42359c1f 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -104,7 +104,7 @@ cdef class ArrayViewHolder: """ cdef ArrowArrayView c_array_view - def __init__(self): + def __cinit__(self): ArrowArrayViewInitFromType(&self.c_array_view, NANOARROW_TYPE_UNINITIALIZED) def __dealloc__(self): @@ -193,7 +193,7 @@ cdef class Schema: base = SchemaHolder() return Schema(base, base._addr()) - def __init__(self, object base, uintptr_t addr): + def __cinit__(self, object base, uintptr_t addr): self._base = base, self._ptr = addr @@ -318,7 +318,7 @@ cdef class SchemaView: NANOARROW_TYPE_SPARSE_UNION ) - def __init__(self): + def __cinit__(self): self._schema_view.type = NANOARROW_TYPE_UNINITIALIZED self._schema_view.storage_type = NANOARROW_TYPE_UNINITIALIZED @@ -420,7 +420,7 @@ cdef class Array: base = ArrayHolder() return Array(base, base._addr(), 
schema) - def __init__(self, object base, uintptr_t addr, Schema schema): + def __cinit__(self, object base, uintptr_t addr, Schema schema): self._base = base, self._ptr = addr self._schema = schema @@ -511,7 +511,7 @@ cdef class ArrayView: cdef ArrowArrayView* _ptr cdef Array _array - def __init__(self, object base, uintptr_t addr, Array array): + def __cinit__(self, object base, uintptr_t addr, Array array): self._base = base, self._ptr = addr self._array = array @@ -542,7 +542,7 @@ cdef class SchemaChildren: cdef Schema _parent cdef int64_t _length - def __init__(self, Schema parent): + def __cinit__(self, Schema parent): self._parent = parent self._length = parent._ptr.n_children @@ -570,7 +570,7 @@ cdef class SchemaMetadata: cdef const char* _metadata cdef ArrowMetadataReader _reader - def __init__(self, object parent, uintptr_t ptr): + def __cinit__(self, object parent, uintptr_t ptr): self._parent = parent self._metadata = ptr @@ -600,7 +600,7 @@ cdef class ArrayChildren: cdef Array _parent cdef int64_t _length - def __init__(self, Array parent): + def __cinit__(self, Array parent): self._parent = parent self._length = parent._ptr.n_children @@ -625,7 +625,7 @@ cdef class ArrayViewChildren: cdef ArrayView _parent cdef int64_t _length - def __init__(self, ArrayView parent): + def __cinit__(self, ArrayView parent): self._parent = parent self._length = parent._ptr.n_children @@ -659,7 +659,7 @@ cdef class BufferView: cdef Py_ssize_t _shape cdef Py_ssize_t _strides - def __init__(self, object base, uintptr_t addr, + def __cinit__(self, object base, uintptr_t addr, ArrowBufferType buffer_type, ArrowType buffer_data_type, Py_ssize_t element_size_bits): self._base = base @@ -730,7 +730,7 @@ cdef class ArrayViewBuffers: cdef ArrayView _array_view cdef int64_t _length - def __init__(self, ArrayView array_view): + def __cinit__(self, ArrayView array_view): self._array_view = array_view self._length = array_view._array._ptr.n_buffers @@ -781,7 +781,7 @@ cdef class 
ArrayStream: cdef ArrowArrayStream* _ptr cdef object _cached_schema - def __init__(self, object base, uintptr_t addr): + def __cinit__(self, object base, uintptr_t addr): self._base = base, self._ptr = addr self._cached_schema = None From b73804cb01e8d9d2411f27501ffa77e2b55418a7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 16:44:12 -0300 Subject: [PATCH 49/52] clean up Cython --- python/nanoarrow/_lib.pyx | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index f42359c1f..4fffea2ad 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -38,6 +38,7 @@ def c_version(): """ return ArrowNanoarrowVersion().decode("UTF-8") + cdef class SchemaHolder: """Memory holder for an ArrowSchema @@ -57,6 +58,7 @@ cdef class SchemaHolder: def _addr(self): return &self.c_schema + cdef class ArrayHolder: """Memory holder for an ArrowArray @@ -95,6 +97,7 @@ cdef class ArrayStreamHolder: def _addr(self): return &self.c_array_stream + cdef class ArrayViewHolder: """Memory holder for an ArrowArrayView @@ -123,7 +126,7 @@ class NanoarrowException(RuntimeError): and store the components of the original error. 
""" - def __init__(self, what, code, message): + def __init__(self, what, code, message=""): self.what = what self.code = code self.message = message @@ -708,7 +711,7 @@ cdef class BufferView: return "B" def __getbuffer__(self, Py_buffer *buffer, int flags): - buffer.buf = self._ptr.data.data + buffer.buf = self._ptr.data.data buffer.format = self._get_format() buffer.internal = NULL buffer.itemsize = self._strides @@ -782,7 +785,7 @@ cdef class ArrayStream: cdef object _cached_schema def __cinit__(self, object base, uintptr_t addr): - self._base = base, + self._base = base self._ptr = addr self._cached_schema = None @@ -811,18 +814,13 @@ cdef class ArrayStream: message.decode("UTF-8") ) else: - Error.raise_error("ArrowArrayStream::get_schema()", code) + raise NanoarrowException("ArrowArrayStream::get_schema()", code) self._cached_schema = schema def get_schema(self): """Get the schema associated with this stream """ - # Update the cached copy of the schema as an independent object - self._cached_schema = Schema.allocate() - self._get_schema(self._cached_schema) - - # Return an independent copy out = Schema.allocate() self._get_schema(out) return out @@ -834,6 +832,10 @@ cdef class ArrayStream: """ self._assert_valid() + # We return a reference to the same Python object for each + # Array that is returned. This is independent of get_schema(), + # which is guaranteed to call the C object's callback and + # faithfully pass on the returned value. 
if self._cached_schema is None: self._cached_schema = Schema.allocate() self._get_schema(self._cached_schema) @@ -850,7 +852,7 @@ cdef class ArrayStream: message.decode("UTF-8") ) else: - Error.raise_error("ArrowArrayStream::get_next()", code) + raise NanoarrowException("ArrowArrayStream::get_next()", code) if not array.is_valid(): return None From a5d4479e569b783d283fd64612199491ce4670f2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 16:48:28 -0300 Subject: [PATCH 50/52] use StopIteration --- python/nanoarrow/_lib.pyx | 8 +++----- python/tests/test_nanoarrow.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index 4fffea2ad..aa3ee2b61 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -855,15 +855,13 @@ cdef class ArrayStream: raise NanoarrowException("ArrowArrayStream::get_next()", code) if not array.is_valid(): - return None + raise StopIteration() else: return array def __iter__(self): - array = self.get_next() - while array is not None: - yield array - array = self.get_next() + while True: + yield self.get_next() @staticmethod def allocate(): diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 3f5bea1a4..9501281d6 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -281,7 +281,8 @@ def test_array_stream(): assert array_stream.is_valid() is True array = array_stream.get_next() assert array.schema.children[0].name == "some_column" - assert array_stream.get_next() is None + with pytest.raises(StopIteration): + array_stream.get_next() def test_array_stream_iter(): From 6a825047a57228ecbb8a1f56f2bbec0c0236bce9 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 17:23:27 -0300 Subject: [PATCH 51/52] more clear distinction from the array--array view interaction --- python/nanoarrow/_lib.pyx | 64 +++++++++++++++++++++++++--------- python/tests/test_nanoarrow.py | 3 
-- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index aa3ee2b61..b1851f162 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -487,15 +487,18 @@ cdef class Array: if result != NANOARROW_OK: error.raise_message("ArrowArrayViewSetArray()", result) - return ArrayView(holder, holder._addr(), self) + return ArrayView(holder, holder._addr(), self._schema, self) cdef class ArrayView: """ArrowArrayView wrapper - The ArrowArrayView is a nanoarrow C library structure that facilitates - access to the deserialized content of an ArrowArray (e.g., buffer types, - lengths, and content). This wrapper extends that facility to Python. + The ArrowArrayView is a nanoarrow C library structure that provides + structured access to buffer addresses, buffer sizes, and buffer + data types. The buffer data is usually propagated from an ArrowArray + but can also be propagated from other types of objects (e.g., serialized + IPC). The offset and length of this view are independent of its parent + (i.e., this object can also represent a slice of its parent).
Examples -------- @@ -512,12 +515,26 @@ cdef class ArrayView: """ cdef object _base cdef ArrowArrayView* _ptr - cdef Array _array + cdef Schema _schema + cdef object _base_buffer - def __cinit__(self, object base, uintptr_t addr, Array array): - self._base = base, + def __cinit__(self, object base, uintptr_t addr, Schema schema, object base_buffer): + self._base = base self._ptr = addr - self._array = array + self._schema = schema + self._base_buffer = base_buffer + + @property + def length(self): + return self._ptr.length + + @property + def offset(self): + return self._ptr.offset + + @property + def null_count(self): + return self._ptr.null_count @property def children(self): @@ -529,15 +546,20 @@ cdef class ArrayView: @property def dictionary(self): - return ArrayView(self, self._ptr.dictionary, self._array.dictionary) - - @property - def array(self): - return self._array + if self._ptr.dictionary == NULL: + return None + else: + return ArrayView( + self, + self._ptr.dictionary, + self._schema.dictionary, + None + ) @property def schema(self): - return self._array._schema + return self._schema + cdef class SchemaChildren: """Wrapper for a lazily-resolved list of Schema children @@ -639,13 +661,19 @@ cdef class ArrayViewChildren: k = int(k) if k < 0 or k >= self._length: raise IndexError(f"{k} out of range [0, {self._length})") - return ArrayView(self._parent, self._child_addr(k), self._parent._array.children[k]) + return ArrayView( + self._parent, + self._child_addr(k), + self._parent._schema.children[k], + None + ) cdef _child_addr(self, int64_t i): cdef ArrowArrayView** children = self._parent._ptr.children cdef ArrowArrayView* child = children[i] return child + cdef class BufferView: """Wrapper for Array buffer content @@ -735,7 +763,11 @@ cdef class ArrayViewBuffers: def __cinit__(self, ArrayView array_view): self._array_view = array_view - self._length = array_view._array._ptr.n_buffers + self._length = 3 + for i in range(3): + if 
self._array_view._ptr.layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE: + self._length = i + break def __len__(self): return self._length diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index 9501281d6..316227407 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -168,7 +168,6 @@ def test_array_view(): array = na.array(pa.array([1, 2, 3], pa.int32())) view = array.view() - assert view.array is array assert view.schema is array.schema data_buffer = memoryview(view.buffers[1]) @@ -201,11 +200,9 @@ def test_array_view_recursive(): view = array.view() assert len(view.buffers) == 1 assert len(view.children) == 1 - assert view.array._addr() == array._addr() assert view.schema._addr() == array.schema._addr() assert len(view.children[0].buffers) == 2 - assert view.children[0].array._addr() == array.children[0]._addr() assert view.children[0].schema._addr() == array.schema.children[0]._addr() assert view.children[0].schema._addr() == array.children[0].schema._addr() From 6f530e888edb26f581cae5b0892bf934903644dc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 14 Jun 2023 21:24:32 -0300 Subject: [PATCH 52/52] fix doctest --- python/nanoarrow/_lib.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx index b1851f162..b5210e3e9 100644 --- a/python/nanoarrow/_lib.pyx +++ b/python/nanoarrow/_lib.pyx @@ -810,7 +810,9 @@ cdef class ArrayStream: >>> array_stream.get_next().length 3 >>> array_stream.get_next() is None - True + Traceback (most recent call last): + ... + StopIteration """ cdef object _base cdef ArrowArrayStream* _ptr