From 8f888a9b11433b4803d9096b1e633958bc584706 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 23 Feb 2023 15:06:03 -0400 Subject: [PATCH] maybe schema class --- python/.gitignore | 2 +- python/setup.py | 3 +- python/src/nanoarrow/__init__.py | 18 +++++ python/src/nanoarrow/_lib.pyx | 85 +++++++++++++++++++++- python/src/nanoarrow/arrow_c.pxd | 55 +++++++++++++++ python/src/nanoarrow/nanoarrow_c.pxd | 101 ++++++++++++++++++--------- python/tests/test_nanoarrow.py | 22 ++++-- 7 files changed, 241 insertions(+), 45 deletions(-) create mode 100644 python/src/nanoarrow/arrow_c.pxd diff --git a/python/.gitignore b/python/.gitignore index fcf8363ba..a73fd3d06 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -18,7 +18,7 @@ src/nanoarrow/nanoarrow.c src/nanoarrow/nanoarrow.h -src/nanoarrow/*.cpp +src/nanoarrow/*.c # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/python/setup.py b/python/setup.py index f6f7efb1c..b89cf1903 100644 --- a/python/setup.py +++ b/python/setup.py @@ -24,7 +24,6 @@ import numpy as np - # setuptools gets confused by relative paths that extend above the project root target = Path(__file__).parent / "src" / "nanoarrow" shutil.copy( @@ -39,7 +38,7 @@ Extension( name="nanoarrow._lib", include_dirs=[np.get_include(), "src/nanoarrow"], - language="c++", + language="c", sources=[ "src/nanoarrow/_lib.pyx", "src/nanoarrow/nanoarrow.c", diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 1586e60ab..9a148a4fc 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -17,4 +17,22 @@ from ._lib import ( # noqa: F401 as_numpy_array, + version, + CSchemaHolder, + CSchema, ) + +class Schema(CSchema): + + def __init__(self, parent=None, addr=None) -> None: + if parent is None: + parent = CSchemaHolder() + if addr is None: + addr = parent._addr() + super().__init__(parent, addr) + + @staticmethod + def from_pyarrow(obj): + schema = Schema() + 
obj._export_to_c(schema._addr()) + return schema diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index a6b4da153..ba9cd21f9 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -19,8 +19,8 @@ """Low-level nanoarrow Python bindings.""" -from libc.stdint cimport uint8_t, uintptr_t - +from libc.stdint cimport uint8_t, uintptr_t, int64_t +from cpython.mem cimport PyMem_Malloc, PyMem_Free from nanoarrow_c cimport * import numpy as np @@ -84,3 +84,84 @@ def as_numpy_array(arr): # TODO set base return result + + +def version(): + return ArrowNanoarrowVersion().decode("UTF-8") + +cdef class CSchemaHolder: + cdef ArrowSchema c_schema + + def __init__(self): + self.c_schema.release = NULL + + def __dealloc__(self): + if self.c_schema.release != NULL: + self.c_schema.release(&self.c_schema) + + def _addr(self): + return <uintptr_t>&self.c_schema + +cdef class CSchemaChildren: + cdef CSchema _parent + cdef int64_t _length + + def __init__(self, CSchema parent): + self._parent = parent + self._length = parent._ptr.n_children + + def __len__(self): + return self._length + + def __getitem__(self, k): + k = int(k) + if k < 0 or k >= self._length: + raise IndexError(f"{k} out of range [0, {self._length})") + + return type(self._parent)(self._parent, self._child_addr(k)) + + cdef _child_addr(self, int64_t i): + cdef ArrowSchema** children = self._parent._ptr.children + cdef ArrowSchema* child = children[i] + return <uintptr_t>child + +cdef class CSchema: + cdef object _base + cdef ArrowSchema* _ptr + + def __init__(self, object base, uintptr_t addr) -> None: + self._base = base + self._ptr = <ArrowSchema*>addr + + def _addr(self): + return <uintptr_t>self._ptr + + def __repr__(self) -> str: + cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True) + cdef char* out = <char*>PyMem_Malloc(n_chars + 1) + if not out: + raise MemoryError() + + ArrowSchemaToString(self._ptr, out, n_chars + 1, True) + try: + return out.decode("UTF-8") + finally: + PyMem_Free(out) + +
@property + def format(self): + if self._ptr.format != NULL: + return self._ptr.format.decode("UTF-8") + + @property + def name(self): + if self._ptr.name != NULL: + return self._ptr.name.decode("UTF-8") + + @property + def flags(self): + return self._ptr.flags + + @property + def children(self): + return CSchemaChildren(self) diff --git a/python/src/nanoarrow/arrow_c.pxd b/python/src/nanoarrow/arrow_c.pxd new file mode 100644 index 000000000..a5f98c8af --- /dev/null +++ b/python/src/nanoarrow/arrow_c.pxd @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: language_level = 3 + +from libc.stdint cimport int64_t + +cdef extern from "nanoarrow.h": + cdef int ARROW_FLAG_DICTIONARY_ORDERED + cdef int ARROW_FLAG_NULLABLE + cdef int ARROW_FLAG_MAP_KEYS_SORTED + + cdef struct ArrowSchema: + const char* format + const char* name + const char* metadata + int64_t flags + int64_t n_children + ArrowSchema** children + ArrowSchema* dictionary + void (*release)(ArrowSchema*) + void* private_data + + cdef struct ArrowArray: + int64_t length + int64_t null_count + int64_t offset + int64_t n_buffers + int64_t n_children + const void** buffers + ArrowArray** children + ArrowArray* dictionary + void (*release)(ArrowArray*) + void* private_data + + cdef struct ArrowArrayStream: + int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) + int (*get_next)(ArrowArrayStream* stream, ArrowArray* out) + const char* (*get_last_error)(ArrowArrayStream*) + void (*release)(ArrowArrayStream* stream) + void* private_data diff --git a/python/src/nanoarrow/nanoarrow_c.pxd b/python/src/nanoarrow/nanoarrow_c.pxd index 440f449c1..2d76e0d8a 100644 --- a/python/src/nanoarrow/nanoarrow_c.pxd +++ b/python/src/nanoarrow/nanoarrow_c.pxd @@ -17,30 +17,20 @@ # cython: language_level = 3 -from libc.stdint cimport int64_t, int8_t, uint8_t +from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t +from arrow_c cimport ArrowSchema, ArrowArray, ArrowArrayStream cdef extern from "nanoarrow.h": - struct ArrowSchema: - const char* format - int64_t n_children - void (*release)(ArrowSchema*) - - struct ArrowArray: - int64_t length - int64_t null_count - int64_t offset - const void** buffers - void (*release)(ArrowArray*) - - struct ArrowArrayStream: - int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) - ctypedef int ArrowErrorCode + cdef int NANOARROW_OK + + cdef struct ArrowError: + pass enum ArrowType: - NANOARROW_TYPE_UNINITIALIZED = 0 - NANOARROW_TYPE_NA = 1 + NANOARROW_TYPE_UNINITIALIZED + 
NANOARROW_TYPE_NA NANOARROW_TYPE_BOOL NANOARROW_TYPE_UINT8 NANOARROW_TYPE_INT8 @@ -87,34 +77,53 @@ cdef extern from "nanoarrow.h": NANOARROW_BUFFER_TYPE_DATA_OFFSET NANOARROW_BUFFER_TYPE_DATA - struct ArrowError: - pass + enum ArrowTimeUnit: + NANOARROW_TIME_UNIT_SECOND + NANOARROW_TIME_UNIT_MILLI + NANOARROW_TIME_UNIT_MICRO + NANOARROW_TIME_UNIT_NANO - const char* ArrowErrorMessage(ArrowError* error) - - struct ArrowLayout: - ArrowBufferType buffer_type[3] - int64_t element_size_bits[3] - int64_t child_size_elements + cdef struct ArrowStringView: + const char* data + int64_t size_bytes cdef union buffer_data: const void* data const int8_t* as_int8 const uint8_t* as_uint8 - - struct ArrowBufferView: + const int16_t* as_int16 + const uint16_t* as_uint16 + const int32_t* as_int32 + const uint32_t* as_uint32 + const int64_t* as_int64 + const uint64_t* as_uint64 + const double* as_double + const float* as_float + const char* as_char + + cdef struct ArrowBufferView: buffer_data data int64_t size_bytes - struct ArrowBuffer: + cdef struct ArrowBufferAllocator: + pass + + cdef struct ArrowBuffer: uint8_t* data int64_t size_bytes + int64_t capacity_bytes + ArrowBufferAllocator allocator - struct ArrowBitmap: + cdef struct ArrowBitmap: ArrowBuffer buffer int64_t size_bits - struct ArrowArrayView: + cdef struct ArrowLayout: + ArrowBufferType buffer_type[3] + int64_t element_size_bits[3] + int64_t child_size_elements + + cdef struct ArrowArrayView: ArrowArray* array ArrowType storage_type ArrowLayout layout @@ -122,6 +131,30 @@ cdef extern from "nanoarrow.h": int64_t n_children ArrowArrayView** children - ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error) - ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error) - int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to) + cdef const char* ArrowNanoarrowVersion() + cdef const char* 
ArrowErrorMessage(ArrowError* error) + + cdef void ArrowSchemaMove(ArrowSchema* src, ArrowSchema* dst) + cdef void ArrowArrayMove(ArrowArray* src, ArrowArray* dst) + cdef void ArrowArrayStreamMove(ArrowArrayStream* src, ArrowArrayStream* dst) + + cdef int64_t ArrowSchemaToString(ArrowSchema* schema, char* out, int64_t n, + char recursive) + cdef ArrowErrorCode ArrowSchemaDeepCopy(ArrowSchema* schema, + ArrowSchema* schema_out) + cdef ArrowErrorCode ArrowSchemaSetType(ArrowSchema* schema,ArrowType type_) + ArrowErrorCode ArrowSchemaSetTypeStruct(ArrowSchema* schema, int64_t n_children) + + cdef struct ArrowMetadataReader: + pass + + cdef ArrowErrorCode ArrowMetadataReaderInit(ArrowMetadataReader* reader, + const char* metadata) + + cdef ArrowErrorCode ArrowMetadataReaderRead(ArrowMetadataReader* reader, + ArrowStringView* key_out, + ArrowStringView* value_out) + + cdef ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error) + cdef ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error) + cdef int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index fd76534e1..2e3bbb709 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -1,27 +1,37 @@ import numpy as np import pyarrow as pa -import nanoarrow +import nanoarrow as na import pytest +def test_version(): + assert(na.version() == "0.1.0-SNAPSHOT") def test_as_numpy_array(): - + arr = pa.array([1, 2, 3]) - result = nanoarrow.as_numpy_array(arr) + result = na.as_numpy_array(arr) expected = arr.to_numpy() np.testing.assert_array_equal(result, expected) arr = pa.array([1, 2, 3], pa.uint8()) - result = nanoarrow.as_numpy_array(arr) + result = na.as_numpy_array(arr) expected = arr.to_numpy() np.testing.assert_array_equal(result, expected) arr = pa.array([1, 2, None]) with 
pytest.raises(ValueError, match="Cannot convert array with nulls"): - nanoarrow.as_numpy_array(arr) + na.as_numpy_array(arr) arr = pa.array([[1], [2, 3]]) with pytest.raises(TypeError, match="Cannot convert a non-primitive array"): - nanoarrow.as_numpy_array(arr) + na.as_numpy_array(arr) + +def test_schema(): + pa_schema = pa.schema([pa.field("some_name", pa.int32())]) + na_schema = na.Schema.from_pyarrow(pa_schema) + assert(na_schema.format == "+s") + assert(na_schema.flags == 0) + assert(len(na_schema.children) == 1) + assert(na_schema.children[0].format == "i")