diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 7d9357809..4b599f7ee 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -40,7 +40,7 @@ jobs: steps: - uses: actions/checkout@v3 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -57,3 +57,31 @@ jobs: - name: Run tests run: | pytest python/tests -v -s + + - name: Run doctests + if: success() && matrix.python-version == '3.10' + run: | + # Needs editable install to run --doctest-cython + pip install pytest-cython + pip install -e python + pytest python --doctest-cython + + - name: Coverage + if: success() && matrix.python-version == '3.10' + run: | + pip uninstall --yes nanoarrow + pip install pytest-cov Cython + pushd python + + # Build with Cython + gcc coverage options + NANOARROW_PYTHON_COVERAGE=1 python setup.py build_ext --inplace + + # Run tests + coverage.py (generates .coverage + coverage.xml files) + python -m pytest --cov ./nanoarrow + python -m coverage xml + + - name: Upload coverage to codecov + if: success() && matrix.python-version == '3.10' + uses: codecov/codecov-action@v2 + with: + files: 'python/coverage.xml' diff --git a/python/.coveragerc b/python/.coveragerc new file mode 100644 index 000000000..1fb6a24ea --- /dev/null +++ b/python/.coveragerc @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# .coveragerc to control coverage.py +[run] +plugins = Cython.Coverage diff --git a/python/.gitignore b/python/.gitignore index fcf8363ba..b3724522b 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -16,9 +16,10 @@ # specific language governing permissions and limitations # under the License. -src/nanoarrow/nanoarrow.c -src/nanoarrow/nanoarrow.h -src/nanoarrow/*.cpp +nanoarrow/nanoarrow.c +nanoarrow/nanoarrow.h +nanoarrow/nanoarrow_c.pxd +nanoarrow/*.c # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 000000000..93ed2fd0a --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +exclude bootstrap.py +include nanoarrow/nanoarrow.c +include nanoarrow/nanoarrow.h +include nanoarrow/nanoarrow_c.pxd diff --git a/python/README.ipynb b/python/README.ipynb new file mode 100644 index 000000000..d89d4c4a6 --- /dev/null +++ b/python/README.ipynb @@ -0,0 +1,392 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "# nanoarrow for Python\n", + "\n", + "The nanoarrow Python package provides bindings to the nanoarrow C library. Like\n", + "the nanoarrow C library, it provides tools to facilitate the use of the\n", + "[Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html) \n", + "and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html) \n", + "interfaces.\n", + "\n", + "## Installation\n", + "\n", + "Python bindings for nanoarrow are not yet available on PyPI. You can install via\n", + "URL (requires a C compiler):\n", + "\n", + "```bash\n", + "python -m pip install \"https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python\"\n", + "```\n", + "\n", + "If you can import the namespace, you're good to go!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import nanoarrow as na" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example\n", + "\n", + "The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow Python package.\n", + "\n", + "### Schemas\n", + "\n", + "Use `nanoarrow.schema()` to convert a data type-like object to an `ArrowSchema`. 
This is currently only implemented for pyarrow objects." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "schema = na.schema(pa.decimal128(10, 3))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can extract the fields of a `Schema` object one at a time or parse it into a view to extract deserialized parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d:10,3\n", + "10\n", + "3\n" + ] + } + ], + "source": [ + "print(schema.format)\n", + "print(schema.view().decimal_precision)\n", + "print(schema.view().decimal_scale)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `nanoarrow.schema()` helper is currently only implemented for pyarrow objects. If your data type has an `_export_to_c()`-like function, you can get the address of a freshly-allocated `ArrowSchema` as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'int32'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema = na.Schema.allocate()\n", + "pa.int32()._export_to_c(schema._addr())\n", + "schema.view().type" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Schema` object cleans up after itself: when the object is deleted, the underlying `Schema` is released." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Arrays\n", + "\n", + "You can use `nanoarrow.array()` to convert an array-like object to a `nanoarrow.Array`, optionally attaching a `Schema` that can be used to interpret its contents. This is currently only implemented for pyarrow objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "array = na.array(pa.array([\"one\", \"two\", \"three\", None]))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like the `Schema`, you can inspect an `Array` by extracting fields individually:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4\n", + "1\n" + ] + } + ], + "source": [ + "print(array.length)\n", + "print(array.null_count)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "...and parse the `Array`/`Schema` combination into a view whose contents is more readily accessible." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[array([7], dtype=uint8),\n", + " array([ 0, 3, 6, 11, 11], dtype=int32),\n", + " array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],\n", + " dtype='|S1')]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "view = array.view()\n", + "[np.array(buffer) for buffer in view.buffers]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like the `Schema`, you can allocate an empty one and access its address with `_addr()` to pass to other array-exporting functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "array = na.Array.allocate(na.Schema.allocate())\n", + "pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())\n", + "array.length" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Array streams\n", + "\n", + "You can use `nanoarrow.array_stream()` to convert an object representing a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This is currently only implemented for pyarrow objects." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "pa_array_child = pa.array([1, 2, 3], pa.int32())\n", + "pa_array = pa.record_batch([pa_array_child], names=[\"some_column\"])\n", + "reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])\n", + "array_stream = na.array_stream(reader)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can pull the next array from the stream using `.get_next()` or use it like an iterator. The `.get_next()` method will return `None` when there are no more arrays in the stream." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "struct\n", + "3\n", + "True\n" + ] + } + ], + "source": [ + "print(array_stream.get_schema())\n", + "\n", + "for array in array_stream:\n", + " print(array.length)\n", + "\n", + "print(array_stream.get_next() is None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get the address of a freshly-allocated stream to pass to a suitable exporting function:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "struct" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "array_stream = na.ArrayStream.allocate()\n", + "reader._export_to_c(array_stream._addr())\n", + "array_stream.get_schema()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Development\n", + "\n", + "Python bindings for nanoarrow are managed with [setuptools](https://setuptools.pypa.io/en/latest/index.html).\n", + "This means you can build the project using:\n", + "\n", + "```shell\n", + "git clone https://github.com/apache/arrow-nanoarrow.git\n", + "cd arrow-nanoarrow/python\n", + "pip install -e .\n", + "```\n", + "\n", + "Tests use [pytest](https://docs.pytest.org/):\n", + "\n", + "```shell\n", + "# Install dependencies\n", + "pip install -e .[test]\n", + "\n", + "# Run tests\n", + "pytest -vvx\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4 
+ }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/README.md b/python/README.md index 701896bb5..db898d24a 100644 --- a/python/README.md +++ b/python/README.md @@ -17,28 +17,196 @@ under the License. --> + + # nanoarrow for Python -Python bindings for nanoarrow. -## Building +The nanoarrow Python package provides bindings to the nanoarrow C library. Like +the nanoarrow C library, it provides tools to facilitate the use of the +[Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html) +and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html) +interfaces. + +## Installation + +Python bindings for nanoarrow are not yet available on PyPI. You can install via +URL (requires a C compiler): + +```bash +python -m pip install "https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python" +``` + +If you can import the namespace, you're good to go! + + +```python +import nanoarrow as na +``` + +## Example + +The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow Python package. + +### Schemas + +Use `nanoarrow.schema()` to convert a data type-like object to an `ArrowSchema`. This is currently only implemented for pyarrow objects. + + +```python +import pyarrow as pa +schema = na.schema(pa.decimal128(10, 3)) +``` + +You can extract the fields of a `Schema` object one at a time or parse it into a view to extract deserialized parameters. + + +```python +print(schema.format) +print(schema.view().decimal_precision) +print(schema.view().decimal_scale) +``` + + d:10,3 + 10 + 3 + + +The `nanoarrow.schema()` helper is currently only implemented for pyarrow objects. 
If your data type has an `_export_to_c()`-like function, you can get the address of a freshly-allocated `ArrowSchema` as well: + + +```python +schema = na.Schema.allocate() +pa.int32()._export_to_c(schema._addr()) +schema.view().type +``` + + + + + 'int32' + + + +The `Schema` object cleans up after itself: when the object is deleted, the underlying `Schema` is released. + +### Arrays + +You can use `nanoarrow.array()` to convert an array-like object to a `nanoarrow.Array`, optionally attaching a `Schema` that can be used to interpret its contents. This is currently only implemented for pyarrow objects. + + +```python +array = na.array(pa.array(["one", "two", "three", None])) +``` + +Like the `Schema`, you can inspect an `Array` by extracting fields individually: + + +```python +print(array.length) +print(array.null_count) +``` + + 4 + 1 + + +...and parse the `Array`/`Schema` combination into a view whose contents is more readily accessible. -Python libraries are managed with [setuptools][setuptools]. In general, that -means all projects can be built as follows: + +```python +import numpy as np +view = array.view() +[np.array(buffer) for buffer in view.buffers] +``` + + + + + [array([7], dtype=uint8), + array([ 0, 3, 6, 11, 11], dtype=int32), + array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'], + dtype='|S1')] + + + +Like the `Schema`, you can allocate an empty one and access its address with `_addr()` to pass to other array-exporting functions. + + +```python +array = na.Array.allocate(na.Schema.allocate()) +pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr()) +array.length +``` + + + + + 3 + + + +### Array streams + +You can use `nanoarrow.array_stream()` to convert an object representing a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This is currently only implemented for pyarrow objects. 
+ + ```python +pa_array_child = pa.array([1, 2, 3], pa.int32()) +pa_array = pa.record_batch([pa_array_child], names=["some_column"]) +reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array]) +array_stream = na.array_stream(reader) +``` + +You can pull the next array from the stream using `.get_next()` or use it like an iterator. The `.get_next()` method will return `None` when there are no more arrays in the stream. + + +```python +print(array_stream.get_schema()) + +for array in array_stream: + print(array.length) + +print(array_stream.get_next() is None) +``` + + struct + 3 + True + + +You can also get the address of a freshly-allocated stream to pass to a suitable exporting function: + + +```python +array_stream = na.ArrayStream.allocate() +reader._export_to_c(array_stream._addr()) +array_stream.get_schema() +``` + + + + + struct + + + +## Development + +Python bindings for nanoarrow are managed with [setuptools](https://setuptools.pypa.io/en/latest/index.html). +This means you can build the project using: ```shell -$ cd python -$ pip install -e . +git clone https://github.com/apache/arrow-nanoarrow.git +cd arrow-nanoarrow/python +pip install -e . ``` -Tests use [pytest][pytest]: +Tests use [pytest](https://docs.pytest.org/): ```shell # Install dependencies -$ pip install -e .[test] +pip install -e .[test] # Run tests -$ pytest -vvx +pytest -vvx ``` - -[pytest]: https://docs.pytest.org/ -[setuptools]: https://setuptools.pypa.io/en/latest/index.html \ No newline at end of file diff --git a/python/bootstrap.py b/python/bootstrap.py new file mode 100644 index 000000000..39b4fd950 --- /dev/null +++ b/python/bootstrap.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""Vendor the nanoarrow C sources and generate ``nanoarrow_c.pxd``.

Running this script places ``nanoarrow.c``/``nanoarrow.h`` in the
``nanoarrow`` directory next to this file and then derives the Cython
declarations (``nanoarrow_c.pxd``) from the vendored header.
"""

import os
import re
import shutil
import subprocess


class NanoarrowPxdGenerator:
    """Generate the nanoarrow_c.pxd file used by the Cython extension.

    The generator reads a nanoarrow.h header, locates ``Arrow*``
    struct/union/enum definitions and function declarations using regular
    expressions and writes the corresponding ``cdef extern`` declarations.
    """

    # Indentation of declarations nested in the generated ``cdef extern`` block
    _INDENT = '    '

    def __init__(self):
        self._define_regexes()

    def generate_nanoarrow_pxd(self, file_in, file_out):
        """Write Cython declarations for the header ``file_in`` to ``file_out``.

        Parameters
        ----------
        file_in : str
            Path to the nanoarrow.h header to read. Its base name is used in
            the generated ``cdef extern from "..."`` statement.
        file_out : str
            Path of the .pxd file to write (overwritten if it exists).
        """
        file_in_name = os.path.basename(file_in)

        # Read the nanoarrow.h header
        with open(file_in, 'r', encoding='UTF-8') as header_file:
            content = header_file.read()

        # Strip comments
        content = self._strip_comments(content)

        # Find types and function definitions
        types = self._find_types(content)
        func_defs = self._find_func_defs(content)

        # Make corresponding cython definitions
        types_cython = [self._type_to_cython(t, self._INDENT) for t in types]
        func_defs_cython = [
            self._func_def_to_cython(d, self._INDENT) for d in func_defs
        ]

        # Unindent the header
        header = self.re_newline_plus_indent.sub('\n', self._pxd_header())

        # Write nanoarrow_c.pxd. newline='\n' disables newline translation so
        # the generated file is byte-identical on every platform.
        with open(file_out, 'w', encoding='UTF-8', newline='\n') as output:
            output.write(header)
            output.write(f'\ncdef extern from "{file_in_name}" nogil:\n')

            # A few things we add in manually
            output.write('\n')
            output.write(f'{self._INDENT}ctypedef int ArrowErrorCode\n')
            output.write(f'{self._INDENT}cdef int NANOARROW_OK\n')
            output.write('\n')

            for type_cython in types_cython:
                output.write(type_cython)
                output.write('\n\n')

            for func_def in func_defs_cython:
                output.write(func_def)
                output.write('\n')

    def _define_regexes(self):
        """Compile the regular expressions used to parse the header.

        The named groups defined here (``type``, ``name``, ``body``,
        ``const``, ``return_type``, ``args``) are the keys read from
        ``groupdict()`` by ``_type_to_cython()`` and
        ``_func_def_to_cython()``.
        """
        self.re_comment = re.compile(r'\s*//[^\n]*')
        self.re_type = re.compile(
            r'(?P<type>struct|union|enum) (?P<name>Arrow[^ ]+) {(?P<body>[^}]*)}'
        )
        self.re_func_def = re.compile(
            r'\n(static inline )?(?P<const>const )?(struct|enum )?'
            r'(?P<return_type>[A-Za-z0-9_*]+) '
            r'(?P<name>Arrow[A-Za-z]+)\((?P<args>[^\)]*)\);'
        )
        # Group 2 (the name) is what replaces a tagged type like "struct ArrowX"
        self.re_tagged_type = re.compile(
            r'(?P<type>struct|union|enum) (?P<name>Arrow[A-Za-z]+)'
        )
        self.re_struct_delim = re.compile(r';\s*')
        self.re_enum_delim = re.compile(r',\s*')
        self.re_whitespace = re.compile(r'\s+')
        self.re_newline_plus_indent = re.compile(r'\n +')

    def _strip_comments(self, content):
        """Remove ``//`` comments (and the whitespace preceding them)."""
        return self.re_comment.sub('', content)

    def _find_types(self, content):
        """Return a dict (type, name, body) for each Arrow* type definition."""
        return [m.groupdict() for m in self.re_type.finditer(content)]

    def _find_func_defs(self, content):
        """Return a dict (const, return_type, name, args) per declaration."""
        return [m.groupdict() for m in self.re_func_def.finditer(content)]

    def _type_to_cython(self, t, indent=''):
        """Render one parsed type definition as a Cython declaration."""
        kind = t['type']
        name = t['name']
        # Cython refers to tagged types by name only ("struct ArrowX" -> "ArrowX")
        body = self.re_tagged_type.sub(r'\2', t['body'].strip())
        delim = self.re_enum_delim if kind == 'enum' else self.re_struct_delim
        items = [item for item in delim.split(body) if item]

        cython_body = f'\n{indent}    '.join([''] + items)
        return f'{indent}{kind} {name}:{cython_body}'

    def _func_def_to_cython(self, d, indent=''):
        """Render one parsed function declaration as a Cython declaration."""
        return_type = d['return_type'].strip()
        if d['const']:
            return_type = 'const ' + return_type
        name = d['name']
        args = self.re_whitespace.sub(' ', d['args'].strip())
        args = self.re_tagged_type.sub(r'\2', args)

        # Cython doesn't do (void)
        if args == 'void':
            args = ''

        return f'{indent}{return_type} {name}({args})'

    def _pxd_header(self):
        """Return the (indented) license header and imports of the .pxd file."""
        return """
        # Licensed to the Apache Software Foundation (ASF) under one
        # or more contributor license agreements.  See the NOTICE file
        # distributed with this work for additional information
        # regarding copyright ownership.  The ASF licenses this file
        # to you under the Apache License, Version 2.0 (the
        # "License"); you may not use this file except in compliance
        # with the License.  You may obtain a copy of the License at
        #
        #   http://www.apache.org/licenses/LICENSE-2.0
        #
        # Unless required by applicable law or agreed to in writing,
        # software distributed under the License is distributed on an
        # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
        # KIND, either express or implied.  See the License for the
        # specific language governing permissions and limitations
        # under the License.

        # cython: language_level = 3

        from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t
        """


def _has_cmake():
    """Return True if ``cmake --version`` can be run successfully."""
    try:
        result = subprocess.run(
            ['cmake', '--version'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # cmake is not installed / not executable
        return False
    return result.returncode == 0


def copy_or_generate_nanoarrow_c():
    """Place nanoarrow.c/nanoarrow.h in the ``nanoarrow`` package directory.

    Runs cmake -DNANOARROW_BUNDLE=ON if cmake exists or copies nanoarrow.c/h
    from ../dist if it does not. Running cmake is safer because it will sync
    any changes from nanoarrow C library sources in the checkout but is not
    strictly necessary for things like installing from GitHub.

    Raises
    ------
    ValueError
        If this file is not located in a nanoarrow checkout or if the
        vendored nanoarrow.c/nanoarrow.h are missing afterwards.
    """
    this_dir = os.path.abspath(os.path.dirname(__file__))
    source_dir = os.path.dirname(this_dir)
    target_dir = os.path.join(this_dir, 'nanoarrow')

    # Always start from a clean slate so stale sources are never packaged
    maybe_nanoarrow_h = os.path.join(target_dir, 'nanoarrow.h')
    maybe_nanoarrow_c = os.path.join(target_dir, 'nanoarrow.c')
    for f in (maybe_nanoarrow_c, maybe_nanoarrow_h):
        if os.path.exists(f):
            os.unlink(f)

    # Existence checks (rather than os.listdir) so that running outside a
    # checkout reaches the ValueError below instead of FileNotFoundError
    is_cmake_dir = os.path.isfile(os.path.join(source_dir, 'CMakeLists.txt'))
    is_in_nanoarrow_repo = os.path.isfile(
        os.path.join(source_dir, 'src', 'nanoarrow', 'nanoarrow.h')
    )
    build_dir = os.path.join(this_dir, '_cmake')

    if is_cmake_dir and is_in_nanoarrow_repo and _has_cmake():
        try:
            os.makedirs(build_dir, exist_ok=True)
            # cwd= is used instead of os.chdir() so the working directory of
            # this process never changes (and is never the directory that is
            # removed below). Failures are detected by the check at the end.
            subprocess.run(
                ['cmake', '../..', '-DNANOARROW_BUNDLE=ON',
                 '-DNANOARROW_NAMESPACE=PythonPkg'],
                cwd=build_dir,
            )
            subprocess.run(
                ['cmake', '--install', '.', '--prefix=../nanoarrow'],
                cwd=build_dir,
            )
        finally:
            if os.path.exists(build_dir):
                # Can fail on Windows with permission issues
                try:
                    shutil.rmtree(build_dir)
                except OSError as e:
                    print(f'Failed to remove _cmake temp directory: {str(e)}')

    elif is_in_nanoarrow_repo:
        # Fall back to the pre-bundled sources shipped in ../dist
        dist_dir = os.path.join(source_dir, 'dist')
        for vendored in (maybe_nanoarrow_c, maybe_nanoarrow_h):
            dist_file = os.path.join(dist_dir, os.path.basename(vendored))
            if os.path.isfile(dist_file):
                shutil.copyfile(dist_file, vendored)
    else:
        raise ValueError('Attempt to build source distribution outside the nanoarrow repo')

    if not all(os.path.exists(f) for f in (maybe_nanoarrow_c, maybe_nanoarrow_h)):
        raise ValueError('Attempt to vendor nanoarrow.c/h failed')

    # The C++ helpers are not used by the Python package
    maybe_nanoarrow_hpp = os.path.join(target_dir, 'nanoarrow.hpp')
    if os.path.exists(maybe_nanoarrow_hpp):
        os.unlink(maybe_nanoarrow_hpp)


def generate_nanoarrow_pxd():
    """Run the pxd generator on the vendored header next to this file."""
    this_dir = os.path.abspath(os.path.dirname(__file__))
    maybe_nanoarrow_h = os.path.join(this_dir, 'nanoarrow/nanoarrow.h')
    maybe_nanoarrow_pxd = os.path.join(this_dir, 'nanoarrow/nanoarrow_c.pxd')

    NanoarrowPxdGenerator().generate_nanoarrow_pxd(
        maybe_nanoarrow_h,
        maybe_nanoarrow_pxd
    )


if __name__ == '__main__':
    copy_or_generate_nanoarrow_c()
    generate_nanoarrow_pxd()
-from ._lib import ( # noqa: F401 - as_numpy_array, -) +from ._lib import c_version, Schema, Array, ArrayView, ArrayStream +from .lib import schema, array, array_stream diff --git a/python/nanoarrow/_lib.pyx b/python/nanoarrow/_lib.pyx new file mode 100644 index 000000000..b5210e3e9 --- /dev/null +++ b/python/nanoarrow/_lib.pyx @@ -0,0 +1,903 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 +# cython: linetrace=True + +"""Low-level nanoarrow Python bindings + +This Cython extension provides low-level Python wrappers around the +Arrow C Data and Arrow C Stream interface structs. In general, there +is one wrapper per C struct and pointer validity is managed by keeping +strong references to Python objects. These wrappers are intended to +be literal and stay close to the structure definitions. 
+""" + +from libc.stdint cimport uintptr_t, int64_t +from cpython.mem cimport PyMem_Malloc, PyMem_Free +from cpython.bytes cimport PyBytes_FromStringAndSize +from cpython cimport Py_buffer +from nanoarrow_c cimport * + +def c_version(): + """Return the nanoarrow C library version string + """ + return ArrowNanoarrowVersion().decode("UTF-8") + + +cdef class SchemaHolder: + """Memory holder for an ArrowSchema + + This class is responsible for the lifecycle of the ArrowSchema + whose memory it is responsible for. When this object is deleted, + a non-NULL release callback is invoked. + """ + cdef ArrowSchema c_schema + + def __cinit__(self): + self.c_schema.release = NULL + + def __dealloc__(self): + if self.c_schema.release != NULL: + self.c_schema.release(&self.c_schema) + + def _addr(self): + return &self.c_schema + + +cdef class ArrayHolder: + """Memory holder for an ArrowArray + + This class is responsible for the lifecycle of the ArrowArray + whose memory it is responsible. When this object is deleted, + a non-NULL release callback is invoked. + """ + cdef ArrowArray c_array + + def __cinit__(self): + self.c_array.release = NULL + + def __dealloc__(self): + if self.c_array.release != NULL: + self.c_array.release(&self.c_array) + + def _addr(self): + return &self.c_array + +cdef class ArrayStreamHolder: + """Memory holder for an ArrowArrayStream + + This class is responsible for the lifecycle of the ArrowArrayStream + whose memory it is responsible. When this object is deleted, + a non-NULL release callback is invoked. 
+ """ + cdef ArrowArrayStream c_array_stream + + def __cinit__(self): + self.c_array_stream.release = NULL + + def __dealloc__(self): + if self.c_array_stream.release != NULL: + self.c_array_stream.release(&self.c_array_stream) + + def _addr(self): + return &self.c_array_stream + + +cdef class ArrayViewHolder: + """Memory holder for an ArrowArrayView + + This class is responsible for the lifecycle of the ArrowArrayView + whose memory it is responsible. When this object is deleted, + ArrowArrayViewReset() is called on the contents. + """ + cdef ArrowArrayView c_array_view + + def __cinit__(self): + ArrowArrayViewInitFromType(&self.c_array_view, NANOARROW_TYPE_UNINITIALIZED) + + def __dealloc__(self): + ArrowArrayViewReset(&self.c_array_view) + + def _addr(self): + return &self.c_array_view + + +class NanoarrowException(RuntimeError): + """An error resulting from a call to the nanoarrow C library + + Calls to the nanoarrow C library and/or the Arrow C Stream interface + callbacks return an errno error code and sometimes a message with extra + detail. This exception wraps a RuntimeError to format a suitable message + and store the components of the original error. + """ + + def __init__(self, what, code, message=""): + self.what = what + self.code = code + self.message = message + + if self.message == "": + super().__init__(f"{self.what} failed ({self.code})") + else: + super().__init__(f"{self.what} failed ({self.code}): {self.message}") + + +cdef class Error: + """Memory holder for an ArrowError + + ArrowError is the C struct that is optionally passed to nanoarrow functions + when a detailed error message might be returned. This class holds a C + reference to the object and provides helpers for raising exceptions based + on the contained message. 
+ """ + cdef ArrowError c_error + + def __cinit__(self): + self.c_error.message[0] = 0 + + def raise_message(self, what, code): + """Raise a NanoarrowException from this message + """ + raise NanoarrowException(what, code, self.c_error.message.decode("UTF-8")) + + @staticmethod + def raise_error(what, code): + """Raise a NanoarrowException without a message + """ + raise NanoarrowException(what, code, "") + + +cdef class Schema: + """ArrowSchema wrapper + + This class provides a user-facing interface to access the fields of + an ArrowSchema as defined in the Arrow C Data interface. These objects + are usually created using `nanoarrow.schema()`. This Python wrapper + allows access to schema fields but does not automatically deserialize + their content: use `.view()` to validate and deserialize the content + into a more easily inspectable object. + + Examples + -------- + + >>> import pyarrow as pa + >>> import nanoarrow as na + >>> schema = na.schema(pa.int32()) + >>> schema.is_valid() + True + >>> schema.format + 'i' + >>> schema.name + '' + >>> schema_view = schema.view() + >>> schema_view.type + 'int32' + """ + cdef object _base + cdef ArrowSchema* _ptr + + @staticmethod + def allocate(): + base = SchemaHolder() + return Schema(base, base._addr()) + + def __cinit__(self, object base, uintptr_t addr): + self._base = base, + self._ptr = addr + + def _addr(self): + return self._ptr + + def is_valid(self): + return self._ptr != NULL and self._ptr.release != NULL + + def _assert_valid(self): + if self._ptr == NULL: + raise RuntimeError("schema is NULL") + if self._ptr.release == NULL: + raise RuntimeError("schema is released") + + def __repr__(self): + cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True) + cdef char* out = PyMem_Malloc(n_chars + 1) + if not out: + raise MemoryError() + + ArrowSchemaToString(self._ptr, out, n_chars + 1, True) + out_str = out.decode("UTF-8") + PyMem_Free(out) + + return out_str + + @property + def format(self): + 
self._assert_valid() + if self._ptr.format != NULL: + return self._ptr.format.decode("UTF-8") + + @property + def name(self): + self._assert_valid() + if self._ptr.name != NULL: + return self._ptr.name.decode("UTF-8") + else: + return None + + @property + def flags(self): + return self._ptr.flags + + @property + def metadata(self): + self._assert_valid() + if self._ptr.metadata != NULL: + return SchemaMetadata(self, self._ptr.metadata) + else: + return None + + @property + def children(self): + self._assert_valid() + return SchemaChildren(self) + + @property + def dictionary(self): + self._assert_valid() + if self._ptr.dictionary != NULL: + return Schema(self, self._ptr.dictionary) + else: + return None + + def view(self): + self._assert_valid() + schema_view = SchemaView() + cdef Error error = Error() + cdef int result = ArrowSchemaViewInit(&schema_view._schema_view, self._ptr, &error.c_error) + if result != NANOARROW_OK: + error.raise_message("ArrowSchemaViewInit()", result) + + return schema_view + + +cdef class SchemaView: + """ArrowSchemaView wrapper + + The ArrowSchemaView is a nanoarrow C library structure that facilitates + access to the deserialized content of an ArrowSchema (e.g., parameter + values for parameterized types). This wrapper extends that facility to Python. 
+ + Examples + -------- + + >>> import pyarrow as pa + >>> import nanoarrow as na + >>> schema = na.schema(pa.decimal128(10, 3)) + >>> schema_view = schema.view() + >>> schema_view.type + 'decimal128' + >>> schema_view.decimal_bitwidth + 128 + >>> schema_view.decimal_precision + 10 + >>> schema_view.decimal_scale + 3 + """ + cdef ArrowSchemaView _schema_view + + _fixed_size_types = ( + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_FIXED_SIZE_BINARY + ) + + _decimal_types = ( + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256 + ) + + _time_unit_types = ( + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_TIMESTAMP + ) + + _union_types = ( + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_SPARSE_UNION + ) + + def __cinit__(self): + self._schema_view.type = NANOARROW_TYPE_UNINITIALIZED + self._schema_view.storage_type = NANOARROW_TYPE_UNINITIALIZED + + @property + def type(self): + cdef const char* type_str = ArrowTypeString(self._schema_view.type) + if type_str != NULL: + return type_str.decode('UTF-8') + + @property + def storage_type(self): + cdef const char* type_str = ArrowTypeString(self._schema_view.storage_type) + if type_str != NULL: + return type_str.decode('UTF-8') + + @property + def fixed_size(self): + if self._schema_view.type in SchemaView._fixed_size_types: + return self._schema_view.fixed_size + + @property + def decimal_bitwidth(self): + if self._schema_view.type in SchemaView._decimal_types: + return self._schema_view.decimal_bitwidth + + @property + def decimal_precision(self): + if self._schema_view.type in SchemaView._decimal_types: + return self._schema_view.decimal_precision + + @property + def decimal_scale(self): + if self._schema_view.type in SchemaView._decimal_types: + return self._schema_view.decimal_scale + + @property + def time_unit(self): + if self._schema_view.type in SchemaView._time_unit_types: + return ArrowTimeUnitString(self._schema_view.time_unit).decode('UTF-8') + + @property + 
# SchemaView properties, continued (timezone, union_type_ids, extension_*).
    def timezone(self):
        """Timezone string of a timestamp type; None for any other type."""
        if self._schema_view.type == NANOARROW_TYPE_TIMESTAMP:
            # Codec spelled 'UTF-8' like every other decode() in this file.
            return self._schema_view.timezone.decode('UTF-8')

    @property
    def union_type_ids(self):
        """Type ids of a union type as a tuple of ints; None for non-unions."""
        if self._schema_view.type in SchemaView._union_types:
            type_ids_str = self._schema_view.union_type_ids.decode('UTF-8').split(',')
            # Return a tuple rather than a generator so that the value can be
            # iterated more than once, measured with len() and indexed. Empty
            # fragments (an empty id list splits to ['']) are skipped instead
            # of failing in int('').
            return tuple(int(type_id) for type_id in type_ids_str if type_id)

    @property
    def extension_name(self):
        """Extension name decoded as UTF-8, or None if no name is set."""
        if self._schema_view.extension_name.data != NULL:
            name_bytes = PyBytes_FromStringAndSize(
                self._schema_view.extension_name.data,
                self._schema_view.extension_name.size_bytes
            )
            return name_bytes.decode('UTF-8')

    @property
    def extension_metadata(self):
        """Raw extension metadata bytes, or None if no extension name is set."""
        if self._schema_view.extension_name.data != NULL:
            return PyBytes_FromStringAndSize(
                self._schema_view.extension_metadata.data,
                self._schema_view.extension_metadata.size_bytes
            )


cdef class Array:
    """ArrowArray wrapper

    This class provides a user-facing interface to access the fields of
    an ArrowArray as defined in the Arrow C Data interface, holding an
    optional reference to a Schema that can be used to safely deserialize
    the content. These objects are usually created using `nanoarrow.array()`.
    This Python wrapper allows access to array fields but does not
    automatically deserialize their content: use `.view()` to validate and
    deserialize the content into a more easily inspectable object.

    Examples
    --------

    >>> import pyarrow as pa
    >>> import numpy as np
    >>> import nanoarrow as na
    >>> array = na.array(pa.array(["one", "two", "three", None]))
    >>> array.length
    4
    >>> array.null_count
    1
    >>> array_view = array.view()
    """
    cdef object _base
    cdef ArrowArray* _ptr
    cdef Schema _schema

    @staticmethod
    def allocate(Schema schema):
        """Create an Array whose storage is owned by a new ArrayHolder."""
        base = ArrayHolder()
        return Array(base, base._addr(), schema)

    def __cinit__(self, object base, uintptr_t addr, Schema schema):
        # `base` is stored so that it stays referenced for as long as this
        # wrapper exists. (A trailing comma here previously stored a 1-tuple.)
        self._base = base
        self._ptr = <ArrowArray*>addr
        self._schema = schema

    def _addr(self):
        """Address of the wrapped ArrowArray as an integer."""
        return <uintptr_t>self._ptr

    def is_valid(self):
        """True if the pointer is non-NULL and the array is not released."""
        return self._ptr != NULL and self._ptr.release != NULL

    def _assert_valid(self):
        """Raise RuntimeError if the array is NULL or has been released."""
        if self._ptr == NULL:
            raise RuntimeError("Array is NULL")
        if self._ptr.release == NULL:
            raise RuntimeError("Array is released")

    @property
    def schema(self):
        return self._schema

    @property
    def length(self):
        self._assert_valid()
        return self._ptr.length

    @property
    def offset(self):
        self._assert_valid()
        return self._ptr.offset

    @property
    def null_count(self):
        # Validate first, like length/offset: this dereferences _ptr.
        self._assert_valid()
        return self._ptr.null_count

    @property
    def buffers(self):
        """Buffer addresses as a tuple of integers (0 for a NULL buffer)."""
        self._assert_valid()
        return tuple(<uintptr_t>self._ptr.buffers[i] for i in range(self._ptr.n_buffers))

    @property
    def children(self):
        # ArrayChildren reads _ptr.n_children when it is constructed.
        self._assert_valid()
        return ArrayChildren(self)

    @property
    def dictionary(self):
        """The dictionary Array, or None if this array has no dictionary."""
        self._assert_valid()
        if self._ptr.dictionary != NULL:
            return Array(self, <uintptr_t>self._ptr.dictionary, self._schema.dictionary)
        else:
            return None

    def view(self):
        """Initialize an ArrayView from this array and its schema.

        Raises whatever ``Error.raise_message()`` raises when either
        ArrowArrayViewInitFromSchema() or ArrowArrayViewSetArray() fails.
        """
        cdef ArrayViewHolder holder = ArrayViewHolder()

        cdef Error error = Error()
        cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view,
                                                       self._schema._ptr, &error.c_error)
        if result != NANOARROW_OK:
            error.raise_message("ArrowArrayViewInitFromSchema()", result)

        result = ArrowArrayViewSetArray(&holder.c_array_view, self._ptr, &error.c_error)
        if result != NANOARROW_OK:
            error.raise_message("ArrowArrayViewSetArray()", result)

        # The view keeps `self` as its base buffer reference.
        return ArrayView(holder, holder._addr(), self._schema, self)


cdef class ArrayView:
    """ArrowArrayView wrapper

    The ArrowArrayView is a nanoarrow C library structure that provides
    structured access to buffers addresses, buffer sizes, and buffer
    data types. The buffer data is usually propagated from an ArrowArray
    but can also be propagated from other types of objects (e.g., serialized
    IPC). The offset and length of this view are independent of its parent
    (i.e., this object can also represent a slice of its parent).

    Examples
    --------

    >>> import pyarrow as pa
    >>> import numpy as np
    >>> import nanoarrow as na
    >>> array_view = na.array(pa.array(["one", "two", "three", None])).view()
    >>> np.array(array_view.buffers[1])
    array([ 0,  3,  6, 11, 11], dtype=int32)
    >>> np.array(array_view.buffers[2])
    array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
          dtype='|S1')
    """
    cdef object _base
    cdef ArrowArrayView* _ptr
    cdef Schema _schema
    cdef object _base_buffer

    def __cinit__(self, object base, uintptr_t addr, Schema schema, object base_buffer):
        self._base = base
        self._ptr = <ArrowArrayView*>addr
        self._schema = schema
        self._base_buffer = base_buffer

    @property
    def length(self):
        return self._ptr.length

    @property
    def offset(self):
        return self._ptr.offset

    @property
    def null_count(self):
        return self._ptr.null_count

    @property
    def children(self):
        return ArrayViewChildren(self)

    @property
    def buffers(self):
        return ArrayViewBuffers(self)

    @property
    def dictionary(self):
        """The dictionary ArrayView, or None if there is no dictionary."""
        if self._ptr.dictionary == NULL:
            return None
        else:
            return ArrayView(
                self,
                <uintptr_t>self._ptr.dictionary,
                self._schema.dictionary,
                None
            )

    @property
    def schema(self):
        return self._schema


cdef class SchemaChildren:
    """Wrapper for a lazily-resolved list of Schema children
    """
    cdef Schema _parent
    cdef int64_t _length

    def __cinit__(self, Schema parent):
        self._parent = parent
        self._length = parent._ptr.n_children

    def __len__(self):
        return self._length

    def __getitem__(self, k):
        # Negative indices are rejected rather than wrapped around.
        k = int(k)
        if k < 0 or k >= self._length:
            raise IndexError(f"{k} out of range [0, {self._length})")

        return Schema(self._parent, self._child_addr(k))

    cdef _child_addr(self, int64_t i):
        cdef ArrowSchema** children = self._parent._ptr.children
        cdef ArrowSchema* child = children[i]
        return <uintptr_t>child


cdef class SchemaMetadata:
    """Wrapper for a lazily-parsed Schema.metadata string
    """

    cdef object _parent
    cdef const char* _metadata
    cdef ArrowMetadataReader _reader

    def __cinit__(self, object parent, uintptr_t ptr):
        self._parent = parent
        self._metadata = <const char*>ptr

    def _init_reader(self):
        """(Re)start the metadata reader at the beginning of the metadata."""
        cdef int result = ArrowMetadataReaderInit(&self._reader, self._metadata)
        if result != NANOARROW_OK:
            Error.raise_error("ArrowMetadataReaderInit()", result)

    def __len__(self):
        self._init_reader()
        return self._reader.remaining_keys

    def __iter__(self):
        """Yield (key, value) pairs: keys decoded as UTF-8, values as bytes."""
        cdef ArrowStringView key
        cdef ArrowStringView value
        self._init_reader()
        while self._reader.remaining_keys > 0:
            ArrowMetadataReaderRead(&self._reader, &key, &value)
            key_obj = PyBytes_FromStringAndSize(key.data, key.size_bytes).decode('UTF-8')
            value_obj = PyBytes_FromStringAndSize(value.data, value.size_bytes)
            yield key_obj, value_obj


cdef class ArrayChildren:
    """Wrapper for a lazily-resolved list of Array children
    """
    cdef Array _parent
    cdef int64_t _length

    def __cinit__(self, Array parent):
        self._parent = parent
        self._length = parent._ptr.n_children

    def __len__(self):
        return self._length

    def __getitem__(self, k):
        # Negative indices are rejected rather than wrapped around.
        k = int(k)
        if k < 0 or k >= self._length:
            raise IndexError(f"{k} out of range [0, {self._length})")
        return Array(self._parent, self._child_addr(k), self._parent.schema.children[k])

    cdef _child_addr(self, int64_t i):
        cdef ArrowArray** children = self._parent._ptr.children
        cdef ArrowArray* child = children[i]
        return <uintptr_t>child



cdef class ArrayViewChildren:
    """Wrapper for a lazily-resolved list of ArrayView children
    """
    cdef ArrayView _parent
    cdef int64_t _length

    def __cinit__(self, ArrayView parent):
        self._parent = parent
        self._length = parent._ptr.n_children

    def __len__(self):
        return self._length

    def __getitem__(self, k):
        # Negative indices are rejected rather than wrapped around.
        k = int(k)
        if k < 0 or k >= self._length:
            raise IndexError(f"{k} out of range [0, {self._length})")
        return ArrayView(
            self._parent,
            self._child_addr(k),
            self._parent._schema.children[k],
            None
        )

    cdef _child_addr(self, int64_t i):
        cdef ArrowArrayView** children = self._parent._ptr.children
        cdef ArrowArrayView* child = children[i]
        return <uintptr_t>child


cdef class BufferView:
    """Wrapper for Array buffer content

    This object is a Python wrapper around a buffer held by an Array.
    It implements the Python buffer protocol and is best accessed through
    another implementor (e.g., `np.array(array_view.buffers[1])`). Note that
    this buffer content does not apply any parent offset.
    """
    cdef object _base
    cdef ArrowBufferView* _ptr
    cdef ArrowBufferType _buffer_type
    cdef ArrowType _buffer_data_type
    cdef Py_ssize_t _element_size_bits
    cdef Py_ssize_t _shape
    cdef Py_ssize_t _strides

    def __cinit__(self, object base, uintptr_t addr,
                  ArrowBufferType buffer_type, ArrowType buffer_data_type,
                  Py_ssize_t element_size_bits):
        self._base = base
        self._ptr = <ArrowBufferView*>addr
        self._buffer_type = buffer_type
        self._buffer_data_type = buffer_data_type
        self._element_size_bits = element_size_bits
        # One-dimensional, contiguous: the stride equals the item size and
        # the shape is the number of whole items in the buffer.
        self._strides = self._item_size()
        self._shape = self._ptr.size_bytes // self._strides

    cdef Py_ssize_t _item_size(self):
        """Size in bytes of one exported item (always at least 1)."""
        cdef Py_ssize_t size_bytes
        if self._buffer_data_type == NANOARROW_TYPE_BOOL:
            return 1
        elif self._buffer_data_type == NANOARROW_TYPE_STRING:
            return 1
        elif self._buffer_data_type == NANOARROW_TYPE_BINARY:
            return 1
        else:
            size_bytes = self._element_size_bits // 8
            # An element size below one byte would make the division in
            # __cinit__ fail; expose such buffers byte by byte instead.
            return size_bytes if size_bytes > 0 else 1

    cdef const char* _get_format(self):
        """struct-style format character describing one item."""
        if self._buffer_data_type == NANOARROW_TYPE_INT8:
            return "b"
        elif self._buffer_data_type == NANOARROW_TYPE_UINT8:
            return "B"
        elif self._buffer_data_type == NANOARROW_TYPE_INT16:
            return "h"
        elif self._buffer_data_type == NANOARROW_TYPE_UINT16:
            return "H"
        elif self._buffer_data_type == NANOARROW_TYPE_INT32:
            return "i"
        elif self._buffer_data_type == NANOARROW_TYPE_UINT32:
            return "I"
        elif self._buffer_data_type == NANOARROW_TYPE_INT64:
            # "q"/"Q" (long long) are 8 bytes on every platform, whereas
            # "l"/"L" (long) are only 4 bytes on some, which would disagree
            # with the 8-byte itemsize reported in __getbuffer__.
            return "q"
        elif self._buffer_data_type == NANOARROW_TYPE_UINT64:
            return "Q"
        elif self._buffer_data_type == NANOARROW_TYPE_FLOAT:
            return "f"
        elif self._buffer_data_type == NANOARROW_TYPE_DOUBLE:
            return "d"
        elif self._buffer_data_type == NANOARROW_TYPE_STRING:
            return "c"
        else:
            return "B"

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        # Export a read-only, one-dimensional view of the underlying bytes.
        buffer.buf = <void*>self._ptr.data.data
        buffer.format = self._get_format()
        buffer.internal = NULL
        buffer.itemsize = self._strides
        buffer.len = self._ptr.size_bytes
        buffer.ndim = 1
        buffer.obj = self
        buffer.readonly = 1
        buffer.shape = &self._shape
        buffer.strides = &self._strides
        buffer.suboffsets = NULL

    def __releasebuffer__(self, Py_buffer *buffer):
        pass


cdef class ArrayViewBuffers:
    """A lazily-resolved list of ArrayView buffers
    """
    cdef ArrayView _array_view
    cdef int64_t _length

    def __cinit__(self, ArrayView array_view):
        self._array_view = array_view
        # The layout has three buffer slots; the length is the index of the
        # first slot whose type is NONE (or 3 if all slots are used).
        self._length = 3
        for i in range(3):
            if self._array_view._ptr.layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE:
                self._length = i
                break

    def __len__(self):
        return self._length

    def __getitem__(self, k):
        """Return buffer `k` as a BufferView, or None if its data is NULL."""
        cdef ArrowBufferView* buffer_view
        k = int(k)
        if k < 0 or k >= self._length:
            raise IndexError(f"{k} out of range [0, {self._length})")
        buffer_view = &(self._array_view._ptr.buffer_views[k])
        if buffer_view.data.data == NULL:
            return None

        return BufferView(
            self._array_view,
            <uintptr_t>buffer_view,
            self._array_view._ptr.layout.buffer_type[k],
            self._array_view._ptr.layout.buffer_data_type[k],
            self._array_view._ptr.layout.element_size_bits[k]
        )


cdef class ArrayStream:
    """ArrowArrayStream wrapper

    This class provides a user-facing interface to access the fields of
    an ArrowArrayStream as defined in the Arrow C Stream interface.
    These objects are usually created using `nanoarrow.array_stream()`.

    Examples
    --------

    >>> import pyarrow as pa
    >>> import nanoarrow as na
    >>> pa_column = pa.array([1, 2, 3], pa.int32())
    >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
    >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema, [pa_batch])
    >>> array_stream = na.array_stream(pa_reader)
    >>> array_stream.get_schema()
    struct
    >>> array_stream.get_next().length
    3
    >>> array_stream.get_next() is None
    Traceback (most recent call last):
      ...
    StopIteration
    """
    cdef object _base
    cdef ArrowArrayStream* _ptr
    cdef object _cached_schema

    def __cinit__(self, object base, uintptr_t addr):
        self._base = base
        self._ptr = <ArrowArrayStream*>addr
        self._cached_schema = None

    def _addr(self):
        """Address of the wrapped ArrowArrayStream as an integer."""
        return <uintptr_t>self._ptr

    def is_valid(self):
        """True if the pointer is non-NULL and the stream is not released."""
        return self._ptr != NULL and self._ptr.release != NULL

    def _assert_valid(self):
        """Raise RuntimeError if the stream is NULL or has been released."""
        if self._ptr == NULL:
            raise RuntimeError("array stream pointer is NULL")
        if self._ptr.release == NULL:
            raise RuntimeError("array stream is released")

    def _get_schema(self, Schema schema):
        """Call the stream's get_schema() callback into `schema`.

        Raises NanoarrowException on a non-OK return code; on success the
        populated schema also becomes the cached schema.
        """
        self._assert_valid()
        cdef int code = self._ptr.get_schema(self._ptr, schema._ptr)
        cdef const char* message = NULL
        if code != NANOARROW_OK:
            message = self._ptr.get_last_error(self._ptr)
            if message != NULL:
                raise NanoarrowException(
                    "ArrowArrayStream::get_schema()",
                    code,
                    message.decode("UTF-8")
                )
            else:
                raise NanoarrowException("ArrowArrayStream::get_schema()", code)

        self._cached_schema = schema

    def get_schema(self):
        """Get the schema associated with this stream
        """
        out = Schema.allocate()
        self._get_schema(out)
        return out

    def get_next(self):
        """Get the next Array from this stream

        Raises StopIteration when there are no more arrays in this stream
        and NanoarrowException if the stream reports an error.
        """
        self._assert_valid()

        # We return a reference to the same Python object for each
        # Array that is returned. This is independent of get_schema(),
        # which is guaranteed to call the C object's callback and
        # faithfully pass on the returned value.
        if self._cached_schema is None:
            # _get_schema() stores the schema in the cache only on success,
            # so a failed call does not leave an unpopulated schema cached.
            self._get_schema(Schema.allocate())

        cdef Array array = Array.allocate(self._cached_schema)
        cdef int code = self._ptr.get_next(self._ptr, array._ptr)
        cdef const char* message = NULL
        if code != NANOARROW_OK:
            message = self._ptr.get_last_error(self._ptr)
            if message != NULL:
                raise NanoarrowException(
                    "ArrowArrayStream::get_next()",
                    code,
                    message.decode("UTF-8")
                )
            else:
                raise NanoarrowException("ArrowArrayStream::get_next()", code)

        # An array that is not valid after a successful call marks the end
        # of the stream.
        if not array.is_valid():
            raise StopIteration()
        else:
            return array

    def __iter__(self):
        """Iterate over the remaining arrays in this stream."""
        while True:
            # End the generator with a plain return: under PEP 479 a
            # StopIteration that escapes a generator body is turned into
            # a RuntimeError.
            try:
                array = self.get_next()
            except StopIteration:
                return
            yield array

    @staticmethod
    def allocate():
        """Create an ArrayStream whose storage is owned by a new ArrayStreamHolder."""
        base = ArrayStreamHolder()
        return ArrayStream(base, base._addr())
diff --git a/python/nanoarrow/lib.py b/python/nanoarrow/lib.py
new file mode 100644
index 000000000..a3c27e72f
--- /dev/null
+++ b/python/nanoarrow/lib.py
@@ -0,0 +1,69 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from ._lib import Schema, Array, ArrayStream


def schema(obj):
    """Convert `obj` to a nanoarrow Schema.

    A Schema is returned unchanged. Any other object must provide
    ``_export_to_c()``, which is called with the address of a newly
    allocated Schema. Raises TypeError for everything else.
    """
    if isinstance(obj, Schema):
        return obj

    # Not particularly safe because _export_to_c() could be exporting an
    # array, schema, or array_stream. The ideal
    # solution here would be something like __arrow_c_schema__()
    if hasattr(obj, "_export_to_c"):
        out = Schema.allocate()
        obj._export_to_c(out._addr())
        return out
    else:
        raise TypeError(
            f"Can't convert object of type {type(obj).__name__} to nanoarrow.Schema"
        )


def array(obj):
    """Convert `obj` to a nanoarrow Array.

    An Array is returned unchanged. Any other object must provide
    ``_export_to_c()``, which is called with the addresses of a newly
    allocated Array and its Schema. Raises TypeError for everything else.
    """
    if isinstance(obj, Array):
        return obj

    # Somewhat safe because calling _export_to_c() with two arguments will
    # not fail with a crash (but will fail with a confusing error). The ideal
    # solution here would be something like __arrow_c_array__()
    if hasattr(obj, "_export_to_c"):
        out = Array.allocate(Schema.allocate())
        obj._export_to_c(out._addr(), out.schema._addr())
        return out
    else:
        raise TypeError(
            f"Can't convert object of type {type(obj).__name__} to nanoarrow.Array"
        )


def array_stream(obj):
    """Convert `obj` to a nanoarrow ArrayStream.

    An ArrayStream is returned unchanged. Any other object must provide
    ``_export_to_c()``, which is called with the address of a newly
    allocated ArrayStream. Raises TypeError for everything else.
    """
    # The pass-through check must test for ArrayStream (it previously tested
    # for Schema, so a Schema was returned from a function that promises an
    # ArrayStream and an existing ArrayStream was never passed through).
    if isinstance(obj, ArrayStream):
        return obj

    # Not particularly safe because _export_to_c() could be exporting an
    # array, schema, or array_stream. The ideal
    # solution here would be something like __arrow_c_array_stream__()
    if hasattr(obj, "_export_to_c"):
        out = ArrayStream.allocate()
        obj._export_to_c(out._addr())
        return out
    else:
        raise TypeError(
            f"Can't convert object of type {type(obj).__name__} to nanoarrow.ArrayStream"
        )
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 1cc2c17ec..743cebe0c 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -19,14 +19,13 @@
 [project]
 name = "nanoarrow"
 version = "1.0.0-alpha0"
-description = ""
+description = "Python bindings to the nanoarrow C library"
 authors = [{name = "Apache Arrow Developers", email = "dev@arrow.apache.org"}]
 license = {text = "Apache-2.0"}
 requires-python = ">=3.8"
-dependencies = ["numpy"]
 
 [project.optional-dependencies]
-test = ["pyarrow", "pytest"]
+test = ["pyarrow", "pytest", "numpy"]
 
 [project.urls]
 homepage = "https://arrow.apache.org"
@@ -36,7 +35,6 @@ repository = "https://github.com/apache/arrow-nanoarrow"
 requires = [
"setuptools >= 61.0.0", "setuptools-scm", - "Cython", - "oldest-supported-numpy", + "Cython" ] build-backend = "setuptools.build_meta" diff --git a/python/setup.py b/python/setup.py index f6f7efb1c..4222cd85d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -17,33 +17,43 @@ # specific language governing permissions and limitations # under the License. -import shutil -from pathlib import Path - +import os +import sys +import subprocess from setuptools import Extension, setup -import numpy as np +# Run bootstrap.py to run cmake generating a fresh bundle based on this +# checkout or copy from ../dist if the caller doesn't have cmake available. +# Note that bootstrap.py won't exist if building from sdist. +this_dir = os.path.dirname(__file__) +bootstrap_py = os.path.join(this_dir, "bootstrap.py") +if os.path.exists(bootstrap_py): + subprocess.run([sys.executable, bootstrap_py]) -# setuptools gets confused by relative paths that extend above the project root -target = Path(__file__).parent / "src" / "nanoarrow" -shutil.copy( - Path(__file__).parent / "../dist/nanoarrow.c", target / "nanoarrow.c" -) -shutil.copy( - Path(__file__).parent / "../dist/nanoarrow.h", target / "nanoarrow.h" -) +# Set some extra flags for compiling with coverage support +if os.getenv("NANOARROW_PYTHON_COVERAGE") == "1": + coverage_compile_args = ["--coverage"] + coverage_link_args = ["--coverage"] + coverage_define_macros = [("CYTHON_TRACE", 1)] +else: + coverage_compile_args = [] + coverage_link_args = [] + coverage_define_macros = [] setup( ext_modules=[ Extension( name="nanoarrow._lib", - include_dirs=[np.get_include(), "src/nanoarrow"], - language="c++", + include_dirs=["nanoarrow"], + language="c", sources=[ - "src/nanoarrow/_lib.pyx", - "src/nanoarrow/nanoarrow.c", + "nanoarrow/_lib.pyx", + "nanoarrow/nanoarrow.c", ], + extra_compile_args=coverage_compile_args, + extra_link_args=coverage_link_args, + define_macros=coverage_define_macros, ) ] ) diff --git 
a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx deleted file mode 100644 index a6b4da153..000000000 --- a/python/src/nanoarrow/_lib.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# cython: language_level = 3 - -"""Low-level nanoarrow Python bindings.""" - -from libc.stdint cimport uint8_t, uintptr_t - -from nanoarrow_c cimport * - -import numpy as np -cimport numpy as cnp - -cnp.import_array() - - -cdef dict _numpy_type_map = { - NANOARROW_TYPE_UINT8: cnp.NPY_UINT8, - NANOARROW_TYPE_INT8: cnp.NPY_INT8, - NANOARROW_TYPE_UINT16: cnp.NPY_UINT16, - NANOARROW_TYPE_INT16: cnp.NPY_INT16, - NANOARROW_TYPE_UINT32: cnp.NPY_UINT32, - NANOARROW_TYPE_INT32: cnp.NPY_INT32, - NANOARROW_TYPE_UINT64: cnp.NPY_UINT64, - NANOARROW_TYPE_INT64: cnp.NPY_INT64, - NANOARROW_TYPE_HALF_FLOAT: cnp.NPY_FLOAT16, - NANOARROW_TYPE_FLOAT: cnp.NPY_FLOAT32, - NANOARROW_TYPE_DOUBLE: cnp.NPY_FLOAT64, -} - - -def as_numpy_array(arr): - cdef ArrowSchema schema - cdef ArrowArray array - cdef ArrowArrayView array_view - cdef ArrowError error - - arr._export_to_c( &array, &schema) - ArrowArrayViewInitFromSchema(&array_view, &schema, &error) - - # primitive arrays have DATA as the second buffer - if 
array_view.layout.buffer_type[1] != NANOARROW_BUFFER_TYPE_DATA: - raise TypeError("Cannot convert a non-primitive array") - - # disallow nulls for this method - if array.null_count > 0: - raise ValueError("Cannot convert array with nulls") - elif array.null_count < 0: - # not yet computed - if array_view.layout.buffer_type[0] == NANOARROW_BUFFER_TYPE_VALIDITY: - if array.buffers[0] != NULL: - null_count = ArrowBitCountSet( - array.buffers[0], array.offset, array.length - ) - if null_count > 0: - raise ValueError("Cannot convert array with nulls") - - cdef int type_num - if array_view.storage_type in _numpy_type_map: - type_num = _numpy_type_map[array_view.storage_type] - else: - raise NotImplementedError(array_view.storage_type) - - cdef cnp.npy_intp dims[1] - dims[0] = array.length - cdef cnp.ndarray result = cnp.PyArray_New( - np.ndarray, 1, dims, type_num, NULL, array.buffers[1], -1, 0, NULL - ) - # TODO set base - - return result diff --git a/python/src/nanoarrow/nanoarrow_c.pxd b/python/src/nanoarrow/nanoarrow_c.pxd deleted file mode 100644 index 440f449c1..000000000 --- a/python/src/nanoarrow/nanoarrow_c.pxd +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# cython: language_level = 3 - -from libc.stdint cimport int64_t, int8_t, uint8_t - - -cdef extern from "nanoarrow.h": - struct ArrowSchema: - const char* format - int64_t n_children - void (*release)(ArrowSchema*) - - struct ArrowArray: - int64_t length - int64_t null_count - int64_t offset - const void** buffers - void (*release)(ArrowArray*) - - struct ArrowArrayStream: - int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) - - ctypedef int ArrowErrorCode - - enum ArrowType: - NANOARROW_TYPE_UNINITIALIZED = 0 - NANOARROW_TYPE_NA = 1 - NANOARROW_TYPE_BOOL - NANOARROW_TYPE_UINT8 - NANOARROW_TYPE_INT8 - NANOARROW_TYPE_UINT16 - NANOARROW_TYPE_INT16 - NANOARROW_TYPE_UINT32 - NANOARROW_TYPE_INT32 - NANOARROW_TYPE_UINT64 - NANOARROW_TYPE_INT64 - NANOARROW_TYPE_HALF_FLOAT - NANOARROW_TYPE_FLOAT - NANOARROW_TYPE_DOUBLE - NANOARROW_TYPE_STRING - NANOARROW_TYPE_BINARY - NANOARROW_TYPE_FIXED_SIZE_BINARY - NANOARROW_TYPE_DATE32 - NANOARROW_TYPE_DATE64 - NANOARROW_TYPE_TIMESTAMP - NANOARROW_TYPE_TIME32 - NANOARROW_TYPE_TIME64 - NANOARROW_TYPE_INTERVAL_MONTHS - NANOARROW_TYPE_INTERVAL_DAY_TIME - NANOARROW_TYPE_DECIMAL128 - NANOARROW_TYPE_DECIMAL256 - NANOARROW_TYPE_LIST - NANOARROW_TYPE_STRUCT - NANOARROW_TYPE_SPARSE_UNION - NANOARROW_TYPE_DENSE_UNION - NANOARROW_TYPE_DICTIONARY - NANOARROW_TYPE_MAP - NANOARROW_TYPE_EXTENSION - NANOARROW_TYPE_FIXED_SIZE_LIST - NANOARROW_TYPE_DURATION - NANOARROW_TYPE_LARGE_STRING - NANOARROW_TYPE_LARGE_BINARY - NANOARROW_TYPE_LARGE_LIST - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO - - enum ArrowBufferType: - NANOARROW_BUFFER_TYPE_NONE - NANOARROW_BUFFER_TYPE_VALIDITY - NANOARROW_BUFFER_TYPE_TYPE_ID - NANOARROW_BUFFER_TYPE_UNION_OFFSET - NANOARROW_BUFFER_TYPE_DATA_OFFSET - NANOARROW_BUFFER_TYPE_DATA - - struct ArrowError: - pass - - const char* ArrowErrorMessage(ArrowError* error) - - struct ArrowLayout: - ArrowBufferType buffer_type[3] - int64_t element_size_bits[3] - int64_t child_size_elements - - cdef union buffer_data: - const 
void* data - const int8_t* as_int8 - const uint8_t* as_uint8 - - struct ArrowBufferView: - buffer_data data - int64_t size_bytes - - struct ArrowBuffer: - uint8_t* data - int64_t size_bytes - - struct ArrowBitmap: - ArrowBuffer buffer - int64_t size_bits - - struct ArrowArrayView: - ArrowArray* array - ArrowType storage_type - ArrowLayout layout - ArrowBufferView buffer_views[3] - int64_t n_children - ArrowArrayView** children - - ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, ArrowSchema* schema, ArrowError* error) - ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, ArrowArray* array, ArrowError* error) - int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to) diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py index fd76534e1..316227407 100644 --- a/python/tests/test_nanoarrow.py +++ b/python/tests/test_nanoarrow.py @@ -1,27 +1,293 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys +import re import numpy as np import pyarrow as pa +import pytest -import nanoarrow +import nanoarrow as na -import pytest +def test_c_version(): + re_version = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$") + assert re_version.match(na.c_version()) is not None + + +def test_schema_helper(): + schema = na.Schema.allocate() + assert na.schema(schema) is schema + + schema = na.schema(pa.null()) + assert isinstance(schema, na.Schema) + + with pytest.raises(TypeError): + na.schema(None) + + +def test_array_helper(): + array = na.Array.allocate(na.Schema.allocate()) + assert na.array(array) is array + + array = na.array(pa.array([], pa.null())) + assert isinstance(array, na.Array) + + with pytest.raises(TypeError): + na.schema(None) + + +def test_schema_basic(): + schema = na.Schema.allocate() + assert schema.is_valid() is False + assert repr(schema) == "[invalid: schema is released]" + + schema = na.schema(pa.schema([pa.field("some_name", pa.int32())])) + + assert schema.format == "+s" + assert schema.flags == 0 + assert schema.metadata is None + assert len(schema.children) == 1 + assert schema.children[0].format == "i" + assert schema.children[0].name == "some_name" + assert repr(schema.children[0]) == "int32" + assert schema.dictionary is None + + with pytest.raises(IndexError): + schema.children[1] + + +def test_schema_dictionary(): + schema = na.schema(pa.dictionary(pa.int32(), pa.utf8())) + assert schema.format == "i" + assert schema.dictionary.format == "u" + + +def test_schema_metadata(): + meta = {"key1": "value1", "key2": "value2"} + schema = na.schema(pa.field("", pa.int32(), metadata=meta)) + + assert len(schema.metadata) == 2 + + meta2 = {k: v for k, v in schema.metadata} + assert list(meta2.keys()) == ["key1", "key2"] + assert list(meta2.values()) == [b"value1", b"value2"] + + +def test_schema_view(): + schema = na.Schema.allocate() + with pytest.raises(RuntimeError): + schema.view() + + schema = na.schema(pa.int32()) + view = 
schema.view() + assert view.type == "int32" + assert view.storage_type == "int32" + + assert view.fixed_size is None + assert view.decimal_bitwidth is None + assert view.decimal_scale is None + assert view.time_unit is None + assert view.timezone is None + assert view.union_type_ids is None + assert view.extension_name is None + assert view.extension_metadata is None + + +def test_schema_view_extra_params(): + schema = na.schema(pa.binary(12)) + view = schema.view() + assert view.fixed_size == 12 + + schema = na.schema(pa.list_(pa.int32(), 12)) + assert view.fixed_size == 12 + + schema = na.schema(pa.decimal128(10, 3)) + view = schema.view() + assert view.decimal_bitwidth == 128 + assert view.decimal_precision == 10 + assert view.decimal_scale == 3 + + schema = na.schema(pa.decimal256(10, 3)) + view = schema.view() + assert view.decimal_bitwidth == 256 + assert view.decimal_precision == 10 + assert view.decimal_scale == 3 + + schema = na.schema(pa.duration("us")) + view = schema.view() + assert view.time_unit == "us" + + schema = na.schema(pa.timestamp("us", tz="America/Halifax")) + view = schema.view() + assert view.type == "timestamp" + assert view.storage_type == "int64" + assert view.time_unit == "us" + assert view.timezone == "America/Halifax" + + meta = { + "ARROW:extension:name": "some_name", + "ARROW:extension:metadata": "some_metadata", + } + schema = na.schema(pa.field("", pa.int32(), metadata=meta)) + view = schema.view() + assert view.extension_name == "some_name" + assert view.extension_metadata == b"some_metadata" + + +def test_array(): + array = na.array(pa.array([1, 2, 3], pa.int32())) + assert array.is_valid() is True + assert array.length == 3 + assert array.offset == 0 + assert array.null_count == 0 + assert len(array.buffers) == 2 + assert array.buffers[0] == 0 + assert len(array.children) == 0 + assert array.dictionary is None + + with pytest.raises(IndexError): + array.children[1] + + +def test_array_view(): + array = na.array(pa.array([1, 2, 
3], pa.int32())) + view = array.view() + + assert view.schema is array.schema + + data_buffer = memoryview(view.buffers[1]) + data_buffer_copy = bytes(data_buffer) + assert len(data_buffer_copy) == 12 + + if sys.byteorder == "little": + assert data_buffer_copy == b"\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00" + else: + assert data_buffer_copy == b"\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03" + + with pytest.raises(IndexError): + view.children[1] + + +def test_array_view_recursive(): + pa_array_child = pa.array([1, 2, 3], pa.int32()) + pa_array = pa.record_batch([pa_array_child], names=["some_column"]) + + array = na.array(pa_array) + + assert array.schema.format == "+s" + assert array.length == 3 + assert len(array.children) == 1 + + assert array.children[0].schema.format == "i" + assert array.children[0].length == 3 + assert array.children[0].schema._addr() == array.schema.children[0]._addr() + + view = array.view() + assert len(view.buffers) == 1 + assert len(view.children) == 1 + assert view.schema._addr() == array.schema._addr() + + assert len(view.children[0].buffers) == 2 + assert view.children[0].schema._addr() == array.schema.children[0]._addr() + assert view.children[0].schema._addr() == array.children[0].schema._addr() + + +def test_array_view_dictionary(): + pa_array = pa.array(["a", "b", "b"], pa.dictionary(pa.int32(), pa.utf8())) + array = na.array(pa_array) + + assert array.schema.format == "i" + assert array.dictionary.schema.format == "u" + + view = array.view() + assert len(view.buffers) == 2 + assert len(view.dictionary.buffers) == 3 + + +def test_buffers_data(): + data_types = [ + (pa.uint8(), np.uint8()), + (pa.int8(), np.int8()), + (pa.uint16(), np.uint16()), + (pa.int16(), np.int16()), + (pa.uint32(), np.uint32()), + (pa.int32(), np.int32()), + (pa.uint64(), np.uint64()), + (pa.int64(), np.int64()), + (pa.float32(), np.float32()), + (pa.float64(), np.float64()), + ] + + for pa_type, np_type in data_types: + view = 
na.array(pa.array([0, 1, 2], pa_type)).view() + np.testing.assert_array_equal( + np.array(view.buffers[1]), np.array([0, 1, 2], np_type) + ) + + +def test_buffers_string(): + view = na.array(pa.array(["a", "bc", "def"])).view() + + assert view.buffers[0] is None + np.testing.assert_array_equal( + np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32()) + ) + np.testing.assert_array_equal( + np.array(view.buffers[2]), np.array(list("abcdef"), dtype="|S1") + ) + + +def test_buffers_binary(): + view = na.array(pa.array([b"a", b"bc", b"def"])).view() + + assert view.buffers[0] is None + np.testing.assert_array_equal( + np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32()) + ) + np.testing.assert_array_equal(np.array(view.buffers[2]), np.array(list(b"abcdef"))) + + +def test_array_stream(): + array_stream = na.ArrayStream.allocate() + assert array_stream.is_valid() is False + with pytest.raises(RuntimeError): + array_stream.get_schema() + with pytest.raises(RuntimeError): + array_stream.get_next() + + pa_array_child = pa.array([1, 2, 3], pa.int32()) + pa_array = pa.record_batch([pa_array_child], names=["some_column"]) + reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array]) + array_stream = na.array_stream(reader) -def test_as_numpy_array(): - - arr = pa.array([1, 2, 3]) - result = nanoarrow.as_numpy_array(arr) - expected = arr.to_numpy() - np.testing.assert_array_equal(result, expected) + assert array_stream.is_valid() is True + array = array_stream.get_next() + assert array.schema.children[0].name == "some_column" + with pytest.raises(StopIteration): + array_stream.get_next() - arr = pa.array([1, 2, 3], pa.uint8()) - result = nanoarrow.as_numpy_array(arr) - expected = arr.to_numpy() - np.testing.assert_array_equal(result, expected) - arr = pa.array([1, 2, None]) - with pytest.raises(ValueError, match="Cannot convert array with nulls"): - nanoarrow.as_numpy_array(arr) +def test_array_stream_iter(): + pa_array_child = pa.array([1, 2, 3], 
pa.int32()) + pa_array = pa.record_batch([pa_array_child], names=["some_column"]) + reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array]) + array_stream = na.array_stream(reader) - arr = pa.array([[1], [2, 3]]) - with pytest.raises(TypeError, match="Cannot convert a non-primitive array"): - nanoarrow.as_numpy_array(arr) + arrays = list(array_stream) + assert len(arrays) == 1 + assert arrays[0].schema.children[0].name == "some_column" diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h index 45ee3c636..bf85b19b2 100644 --- a/src/nanoarrow/nanoarrow_types.h +++ b/src/nanoarrow/nanoarrow_types.h @@ -298,6 +298,8 @@ enum ArrowType { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + static inline const char* ArrowTypeString(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_NA: @@ -416,6 +418,8 @@ enum ArrowValidationLevel { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: @@ -458,6 +462,8 @@ struct ArrowStringView { /// \brief Return a view of a const C string /// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + static inline struct ArrowStringView ArrowCharView(const char* value) { struct ArrowStringView out;