Skip to content

Commit

Permalink
maybe schema class
Browse files Browse the repository at this point in the history
  • Loading branch information
paleolimbot committed Mar 8, 2023
1 parent 9128f21 commit 8f888a9
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 45 deletions.
2 changes: 1 addition & 1 deletion python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

src/nanoarrow/nanoarrow.c
src/nanoarrow/nanoarrow.h
src/nanoarrow/*.cpp
src/nanoarrow/*.c

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
3 changes: 1 addition & 2 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

import numpy as np


# setuptools gets confused by relative paths that extend above the project root
target = Path(__file__).parent / "src" / "nanoarrow"
shutil.copy(
Expand All @@ -39,7 +38,7 @@
Extension(
name="nanoarrow._lib",
include_dirs=[np.get_include(), "src/nanoarrow"],
language="c++",
language="c",
sources=[
"src/nanoarrow/_lib.pyx",
"src/nanoarrow/nanoarrow.c",
Expand Down
18 changes: 18 additions & 0 deletions python/src/nanoarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,22 @@

from ._lib import ( # noqa: F401
as_numpy_array,
version,
CSchemaHolder,
CSchema,
)

class Schema(CSchema):
    """Python-level schema wrapper built on the low-level ``CSchema``.

    When constructed without arguments, a new ``CSchemaHolder`` is created
    and the schema points at the address reported by that holder.
    """

    def __init__(self, parent=None, addr=None) -> None:
        """Wrap the ``ArrowSchema`` located at ``addr``.

        Args:
            parent: Object handed to ``CSchema`` as the base of this schema.
                Defaults to a newly created ``CSchemaHolder``.
            addr: Integer address of the ``ArrowSchema`` struct to wrap.
                Defaults to ``parent._addr()``.
        """
        if parent is None:
            parent = CSchemaHolder()
        if addr is None:
            addr = parent._addr()
        super().__init__(parent, addr)

    @classmethod
    def from_pyarrow(cls, obj):
        """Build a schema by exporting a pyarrow object into a new instance.

        Args:
            obj: Object providing ``_export_to_c(address)``; it is asked to
                export itself to the address of the new schema.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type back).
        """
        # A classmethod rather than a staticmethod: ``cls()`` keeps this
        # alternate constructor usable from subclasses.
        schema = cls()
        obj._export_to_c(schema._addr())
        return schema
85 changes: 83 additions & 2 deletions python/src/nanoarrow/_lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

"""Low-level nanoarrow Python bindings."""

from libc.stdint cimport uint8_t, uintptr_t

from libc.stdint cimport uint8_t, uintptr_t, int64_t
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from nanoarrow_c cimport *

import numpy as np
Expand Down Expand Up @@ -84,3 +84,84 @@ def as_numpy_array(arr):
# TODO set base

return result


def version():
    """Return the string reported by ``ArrowNanoarrowVersion()``, decoded as UTF-8."""
    cdef const char* c_version = ArrowNanoarrowVersion()
    return c_version.decode("UTF-8")

cdef class CSchemaHolder:
    """Hold one ``ArrowSchema`` struct embedded directly in this object.

    The struct's ``release`` callback is set to ``NULL`` on construction.
    If it is non-NULL when the holder is deallocated, the callback is
    invoked on the embedded struct.
    """
    cdef ArrowSchema c_schema

    def __init__(self):
        # Mark the embedded schema as "nothing to release" until something
        # is exported into it.
        self.c_schema.release = NULL

    def __dealloc__(self):
        # __dealloc__ is the finalizer hook Cython runs for cdef classes on
        # every Cython version; __del__ on an extension type is only
        # honoured from Cython 3.0 on, so using it could skip the release.
        # Only C-level state is touched here, which is what __dealloc__
        # permits.
        if self.c_schema.release != NULL:
            self.c_schema.release(&self.c_schema)

    def _addr(self):
        """Return the address of the embedded ``ArrowSchema`` as an integer."""
        return <uintptr_t>&self.c_schema

cdef class CSchemaChildren:
    """Indexable view over the children of a ``CSchema``.

    The number of children is read from the parent once, at construction.
    """
    cdef CSchema _parent
    cdef int64_t _length

    def __init__(self, CSchema parent):
        self._length = parent._ptr.n_children
        self._parent = parent

    def __len__(self):
        """Return the child count captured at construction."""
        return self._length

    def __getitem__(self, k):
        """Return child ``k`` wrapped in the same type as the parent.

        Raises:
            IndexError: if ``int(k)`` is negative or not less than the
                number of children.
        """
        k = int(k)
        if not (0 <= k < self._length):
            raise IndexError(f"{k} out of range [0, {self._length})")

        # The parent is passed as the base of the new wrapper.
        wrapper_type = type(self._parent)
        return wrapper_type(self._parent, self._child_addr(k))

    cdef _child_addr(self, int64_t i):
        # Address of the i-th entry of the parent's ``children`` array.
        cdef ArrowSchema* child_ptr = self._parent._ptr.children[i]
        return <uintptr_t>child_ptr

cdef class CSchema:
    """Low-level wrapper around a pointer to an ``ArrowSchema`` struct.

    The wrapper stores a reference to a ``base`` object alongside the raw
    pointer and exposes a few struct fields as Python properties.
    """
    cdef object _base
    cdef ArrowSchema* _ptr

    def __init__(self, object base, uintptr_t addr) -> None:
        """Wrap the ``ArrowSchema`` at integer address ``addr``.

        Args:
            base: Object to hold a reference to for as long as this wrapper
                exists.
            addr: Address of the ``ArrowSchema`` struct.
        """
        # Store the object itself; a trailing comma here would silently
        # wrap it in a 1-tuple.
        self._base = base
        self._ptr = <ArrowSchema*>addr

    def _addr(self):
        """Return the wrapped pointer as an integer address."""
        return <uintptr_t>self._ptr

    def __repr__(self) -> str:
        """Return the text produced by ``ArrowSchemaToString`` (recursive).

        Raises:
            MemoryError: if the temporary output buffer cannot be allocated.
        """
        # First call with a NULL buffer to learn the required length.
        cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True)
        cdef char* out = <char*>PyMem_Malloc(n_chars + 1)
        if not out:
            raise MemoryError()

        # try/finally so the buffer is freed even if decoding raises.
        try:
            ArrowSchemaToString(self._ptr, out, n_chars + 1, True)
            return out.decode("UTF-8")
        finally:
            PyMem_Free(out)

    @property
    def format(self):
        """The ``format`` field decoded as UTF-8, or ``None`` if it is NULL."""
        if self._ptr.format != NULL:
            return self._ptr.format.decode("UTF-8")

    @property
    def name(self):
        """The ``name`` field decoded as UTF-8, or ``None`` if it is NULL."""
        if self._ptr.name != NULL:
            return self._ptr.name.decode("UTF-8")

    @property
    def flags(self):
        """The integer ``flags`` field of the wrapped struct."""
        return self._ptr.flags

    @property
    def children(self):
        """A ``CSchemaChildren`` view over this schema's children."""
        return CSchemaChildren(self)
55 changes: 55 additions & 0 deletions python/src/nanoarrow/arrow_c.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from libc.stdint cimport int64_t

cdef extern from "nanoarrow.h":
    # Flag constants, declared as plain ints.
    int ARROW_FLAG_DICTIONARY_ORDERED
    int ARROW_FLAG_NULLABLE
    int ARROW_FLAG_MAP_KEYS_SORTED

    # Schema description struct; ``release`` is the callback invoked to
    # free it.
    struct ArrowSchema:
        const char* format
        const char* name
        const char* metadata
        int64_t flags
        int64_t n_children
        ArrowSchema** children
        ArrowSchema* dictionary
        void (*release)(ArrowSchema*)
        void* private_data

    # Array data struct; mirrors ArrowSchema's children/dictionary/release
    # layout.
    struct ArrowArray:
        int64_t length
        int64_t null_count
        int64_t offset
        int64_t n_buffers
        int64_t n_children
        const void** buffers
        ArrowArray** children
        ArrowArray* dictionary
        void (*release)(ArrowArray*)
        void* private_data

    # Stream struct made of callbacks plus an opaque ``private_data``
    # pointer.
    struct ArrowArrayStream:
        int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out)
        int (*get_next)(ArrowArrayStream* stream, ArrowArray* out)
        const char* (*get_last_error)(ArrowArrayStream*)
        void (*release)(ArrowArrayStream* stream)
        void* private_data
101 changes: 67 additions & 34 deletions python/src/nanoarrow/nanoarrow_c.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,20 @@

# cython: language_level = 3

from libc.stdint cimport int64_t, int8_t, uint8_t
from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t

from arrow_c cimport ArrowSchema, ArrowArray, ArrowArrayStream

cdef extern from "nanoarrow.h":
struct ArrowSchema:
const char* format
int64_t n_children
void (*release)(ArrowSchema*)

struct ArrowArray:
int64_t length
int64_t null_count
int64_t offset
const void** buffers
void (*release)(ArrowArray*)

struct ArrowArrayStream:
int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out)

ctypedef int ArrowErrorCode
cdef int NANOARROW_OK

cdef struct ArrowError:
pass

enum ArrowType:
NANOARROW_TYPE_UNINITIALIZED = 0
NANOARROW_TYPE_NA = 1
NANOARROW_TYPE_UNINITIALIZED
NANOARROW_TYPE_NA
NANOARROW_TYPE_BOOL
NANOARROW_TYPE_UINT8
NANOARROW_TYPE_INT8
Expand Down Expand Up @@ -87,41 +77,84 @@ cdef extern from "nanoarrow.h":
NANOARROW_BUFFER_TYPE_DATA_OFFSET
NANOARROW_BUFFER_TYPE_DATA

struct ArrowError:
pass
enum ArrowTimeUnit:
NANOARROW_TIME_UNIT_SECOND
NANOARROW_TIME_UNIT_MILLI
NANOARROW_TIME_UNIT_MICRO
NANOARROW_TIME_UNIT_NANO

const char* ArrowErrorMessage(ArrowError* error)

struct ArrowLayout:
ArrowBufferType buffer_type[3]
int64_t element_size_bits[3]
int64_t child_size_elements
cdef struct ArrowStringView:
const char* data
int64_t size_bytes

cdef union buffer_data:
const void* data
const int8_t* as_int8
const uint8_t* as_uint8

struct ArrowBufferView:
const int16_t* as_int16
const uint16_t* as_uint16
const int32_t* as_int32
const uint32_t* as_uint32
const int64_t* as_int64
const uint64_t* as_uint64
const double* as_double
const float* as_float
const char* as_char

cdef struct ArrowBufferView:
buffer_data data
int64_t size_bytes

struct ArrowBuffer:
cdef struct ArrowBufferAllocator:
pass

cdef struct ArrowBuffer:
uint8_t* data
int64_t size_bytes
int64_t capacity_bytes
ArrowBufferAllocator allocator

struct ArrowBitmap:
cdef struct ArrowBitmap:
ArrowBuffer buffer
int64_t size_bits

struct ArrowArrayView:
cdef struct ArrowLayout:
ArrowBufferType buffer_type[3]
int64_t element_size_bits[3]
int64_t child_size_elements

cdef struct ArrowArrayView:
ArrowArray* array
ArrowType storage_type
ArrowLayout layout
ArrowBufferView buffer_views[3]
int64_t n_children
ArrowArrayView** children

ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error)
ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error)
int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to)
cdef const char* ArrowNanoarrowVersion()
cdef const char* ArrowErrorMessage(ArrowError* error)

cdef void ArrowSchemaMove(ArrowSchema* src, ArrowSchema* dst)
cdef void ArrowArrayMove(ArrowArray* src, ArrowArray* dst)
cdef void ArrowArrayStreamMove(ArrowArrayStream* src, ArrowArrayStream* dst)

cdef int64_t ArrowSchemaToString(ArrowSchema* schema, char* out, int64_t n,
char recursive)
cdef ArrowErrorCode ArrowSchemaDeepCopy(ArrowSchema* schema,
ArrowSchema* schema_out)
cdef ArrowErrorCode ArrowSchemaSetType(ArrowSchema* schema,ArrowType type_)
ArrowErrorCode ArrowSchemaSetTypeStruct(ArrowSchema* schema, int64_t n_children)

cdef struct ArrowMetadataReader:
pass

cdef ArrowErrorCode ArrowMetadataReaderInit(ArrowMetadataReader* reader,
const char* metadata)

cdef ArrowErrorCode ArrowMetadataReaderRead(ArrowMetadataReader* reader,
ArrowStringView* key_out,
ArrowStringView* value_out)

cdef ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error)
cdef ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error)
cdef int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to)
22 changes: 16 additions & 6 deletions python/tests/test_nanoarrow.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,37 @@
import numpy as np
import pyarrow as pa

import nanoarrow
import nanoarrow as na

import pytest

def test_version():
    """The bindings report the expected nanoarrow version string."""
    reported = na.version()
    assert reported == "0.1.0-SNAPSHOT"

def test_as_numpy_array():
    """Compare ``as_numpy_array`` with pyarrow's ``to_numpy`` and check its errors."""
    # Default integer type.
    arr = pa.array([1, 2, 3])
    result = na.as_numpy_array(arr)
    expected = arr.to_numpy()
    np.testing.assert_array_equal(result, expected)

    # Explicit uint8 type.
    arr = pa.array([1, 2, 3], pa.uint8())
    result = na.as_numpy_array(arr)
    expected = arr.to_numpy()
    np.testing.assert_array_equal(result, expected)

    # One call per ``raises`` block: anything after the raising call would
    # never execute.
    arr = pa.array([1, 2, None])
    with pytest.raises(ValueError, match="Cannot convert array with nulls"):
        na.as_numpy_array(arr)

    arr = pa.array([[1], [2, 3]])
    with pytest.raises(TypeError, match="Cannot convert a non-primitive array"):
        na.as_numpy_array(arr)

def test_schema():
    """A pyarrow schema imported via ``Schema.from_pyarrow`` exposes its fields."""
    pa_schema = pa.schema([pa.field("some_name", pa.int32())])
    na_schema = na.Schema.from_pyarrow(pa_schema)
    assert na_schema.format == "+s"
    assert na_schema.flags == 0
    # Compare with ==: ``assert(x, 1)`` asserts a non-empty tuple, which is
    # always true and checks nothing.
    assert len(na_schema.children) == 1
    assert na_schema.children[0].format == "i"

0 comments on commit 8f888a9

Please sign in to comment.