From 83992502ef231842d45057e81eeb5568c0fc05d1 Mon Sep 17 00:00:00 2001 From: gettodaze Date: Fri, 24 May 2024 12:03:58 -0700 Subject: [PATCH 01/28] add dummy code --- src/__init__.py | 1 + src/__init__.pyi | 1 + src/_arraykit.c | 19 +++++++++++++++++++ test/test_util.py | 8 ++++++++ 4 files changed, 29 insertions(+) diff --git a/src/__init__.py b/src/__init__.py index 12a74055..420d9060 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -27,3 +27,4 @@ from ._arraykit import first_true_1d as first_true_1d from ._arraykit import first_true_2d as first_true_2d from ._arraykit import slice_to_ascending_slice as slice_to_ascending_slice +from ._arraykit import array2d_to_array1d as array2d_to_array1d \ No newline at end of file diff --git a/src/__init__.pyi b/src/__init__.pyi index d28fc74e..7caa3cdf 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -138,3 +138,4 @@ def get_new_indexers_and_screen(indexers: np.ndarray, positions: np.ndarray) -> def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ... def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ... def slice_to_ascending_slice(__slice: slice, __size: int) -> slice: ... +def array2d_to_array1d(__array: np.ndarray) -> np.ndarray: ... \ No newline at end of file diff --git a/src/_arraykit.c b/src/_arraykit.c index 1135fabe..15e141a2 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3503,6 +3503,24 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) return AK_ArrayDeepCopy(m, (PyArrayObject*)array, memo); } +// JTODO: here +// Reshape if necessary a row that might be 2D or 1D is returned as a 1D array. +static PyObject * +array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) +{ + AK_CHECK_NUMPY_ARRAY_1D_2D(a); // JTODO: should we allow 1D arrays? + PyArrayObject *array = (PyArrayObject *)a; + + if (PyArray_NDIM(array) == 2) { + npy_intp dim[1] = {PyArray_DIM(array, 1)}; + PyArray_Dims shape = {dim, 1}; + // NOTE: this will set PyErr if shape is not compatible + return PyArray_Newshape(array, &shape, NPY_ANYORDER); + } + Py_INCREF(a); + return a; +} + //------------------------------------------------------------------------------ // type resolution @@ -5943,6 +5961,7 @@ static PyMethodDef arraykit_methods[] = { (PyCFunction)array_deepcopy, METH_VARARGS | METH_KEYWORDS, NULL}, + {"array2d_to_array1d", array2d_to_array1d, METH_O, NULL}, {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL}, {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, {"first_true_1d", diff --git a/test/test_util.py b/test/test_util.py index 196e8ee6..0357af99 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -22,6 +22,7 @@ from arraykit import first_true_1d from arraykit import first_true_2d from arraykit import slice_to_ascending_slice +from arraykit import array2d_to_array1d from performance.reference.util import get_new_indexers_and_screen_ak as get_new_indexers_and_screen_full from arraykit import get_new_indexers_and_screen @@ -283,6 +284,13 @@ def test_array_deepcopy_h(self) -> None: with self.assertRaises(TypeError): a2 = array_deepcopy(a1, ()) + #--------------------------------------------------------------------------- + def test_array2d_to_array1d_a(self) -> None: + a1 = np.arange(10) + a2 = array2d_to_array1d(a1) + self.assertEqual(mloc(a1), mloc(a2)) + + #--------------------------------------------------------------------------- def test_isna_element_a(self) -> None: From 79d57eda81bf7001832a8bc795fc88b2306bace0 Mon Sep 17 00:00:00 2001 From: gettodaze Date: Fri, 24 May 2024 13:03:48 -0700 Subject: [PATCH 02/28] implemented rough version --- src/_arraykit.c | 58 +++++++++++++++++++++++++++++++++++++++-------- test/test_util.py | 24 +++++++++++++++++--- 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 15e141a2..24d6b0dc 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -35,6 +35,18 @@ } \ } while (0) +// Given a PyObject, raise if not an array or is not two dimensional. +# define AK_CHECK_NUMPY_ARRAY_2D(O) \ + do { \ + AK_CHECK_NUMPY_ARRAY(O) \ + int ndim = PyArray_NDIM((PyArrayObject *)O); \ + if (ndim != 2) { \ + return PyErr_Format(PyExc_NotImplementedError,\ + "Expected a 2D array, not %i.", \ + ndim); \ + } \ + } while (0) + // Placeholder of not implemented pathways / debugging. # define AK_NOT_IMPLEMENTED(msg) \ do { \ @@ -3508,17 +3520,45 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) static PyObject * array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) { - AK_CHECK_NUMPY_ARRAY_1D_2D(a); // JTODO: should we allow 1D arrays? - PyArrayObject *array = (PyArrayObject *)a; + AK_CHECK_NUMPY_ARRAY_2D(a); // JTODO: should we allow 1D arrays? + PyArrayObject *input_array = (PyArrayObject *)a; + + // get array dimensions + npy_intp num_rows = PyArray_DIM(input_array, 0); + npy_intp num_cols = PyArray_DIM(input_array, 1); + + // create output array + npy_intp dims[1] = {num_rows}; + PyObject* output_array = PyArray_SimpleNew(1, dims, NPY_OBJECT); + if (!output_array) { + return NULL; // JTODO: check if this is the correct error handling + } + + for (npy_intp i = 0; i < num_rows; ++i) { + PyObject* tuple = PyTuple_New(num_cols); + if (!tuple) { + Py_DECREF(output_array); + return NULL; + } - if (PyArray_NDIM(array) == 2) { - npy_intp dim[1] = {PyArray_DIM(array, 1)}; - PyArray_Dims shape = {dim, 1}; - // NOTE: this will set PyErr if shape is not compatible - return PyArray_Newshape(array, &shape, NPY_ANYORDER); + for (npy_intp j = 0; j < num_cols; ++j) { + PyObject* item = PyArray_GETITEM(input_array, PyArray_GETPTR2(input_array, i, j)); + if (!item) { + Py_DECREF(tuple); + Py_DECREF(output_array); + return NULL; + } + PyTuple_SET_ITEM(tuple, j, item); + } + + PyArray_SETITEM((PyArrayObject*)output_array, PyArray_GETPTR1((PyArrayObject*)output_array, i), tuple); + Py_DECREF(tuple); // JTODO: check } - Py_INCREF(a); - return a; + + PyArray_CLEARFLAGS((PyArrayObject *)output_array, NPY_ARRAY_WRITEABLE); + + // Py_INCREF(a); // JTODO: check if this is necessary + return output_array; } //------------------------------------------------------------------------------ diff --git a/test/test_util.py b/test/test_util.py index 0357af99..7585e121 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -285,10 +285,28 @@ def test_array_deepcopy_h(self) -> None: a2 = array_deepcopy(a1, ()) #--------------------------------------------------------------------------- - def test_array2d_to_array1d_a(self) -> None: + def test_array2d_to_array1d_dummy(self) -> None: + # JTODO: remove a1 = np.arange(10) - a2 = array2d_to_array1d(a1) - self.assertEqual(mloc(a1), mloc(a2)) + with self.assertRaises(NotImplementedError): + # 1 dimensional + _ = array2d_to_array1d(a1) + + def test_array2d_to_array1d_b(self) -> None: + # def py_array2d_to_array1d(array: np.ndarray) -> np.ndarray: + # post: np.ndarray = np.empty(array.shape[0], dtype=object) + # for i, row in enumerate(array): + # post[i] = tuple(row) + # post.flags.writeable = False + # return post + + a1 = np.arange(10).reshape(5, 2) + + result = array2d_to_array1d(a1) + assert isinstance(result[0], tuple) + assert result[0] == (0, 1) + assert tuple(result) == ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9)) + #--------------------------------------------------------------------------- From 59f18c6effef10db6aa9d99d383e443c76ca4875 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Tue, 11 Jun 2024 11:50:36 -0700 Subject: [PATCH 03/28] additional tests --- src/_arraykit.c | 17 ++++++++--------- test/test_util.py | 27 ++++++++++++++++----------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 8aed572d..0781955e 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3523,23 +3523,21 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) return AK_ArrayDeepCopy(m, (PyArrayObject*)array, memo); } -// JTODO: here + // Reshape if necessary a row that might be 2D or 1D is returned as a 1D array. static PyObject * array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) { - AK_CHECK_NUMPY_ARRAY_2D(a); // JTODO: should we allow 1D arrays? + AK_CHECK_NUMPY_ARRAY_2D(a); PyArrayObject *input_array = (PyArrayObject *)a; - - // get array dimensions + npy_intp num_rows = PyArray_DIM(input_array, 0); npy_intp num_cols = PyArray_DIM(input_array, 1); - // create output array npy_intp dims[1] = {num_rows}; PyObject* output_array = PyArray_SimpleNew(1, dims, NPY_OBJECT); - if (!output_array) { - return NULL; // JTODO: check if this is the correct error handling + if (output_array == NULL) { + return NULL; } for (npy_intp i = 0; i < num_rows; ++i) { @@ -3550,10 +3548,11 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) } for (npy_intp j = 0; j < num_cols; ++j) { - PyObject* item = PyArray_GETITEM(input_array, PyArray_GETPTR2(input_array, i, j)); + PyObject* item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); + // PyObject* item = PyArray_GETITEM(input_array, PyArray_GETPTR2(input_array, i, j)); if (!item) { Py_DECREF(tuple); - Py_DECREF(output_array); + Py_DECREF(output_array); // TODO: need to decrer object compmonents return NULL; } PyTuple_SET_ITEM(tuple, j, item); diff --git a/test/test_util.py b/test/test_util.py index 7585e121..a489a4d9 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -286,27 +286,32 @@ def test_array_deepcopy_h(self) -> None: #--------------------------------------------------------------------------- def test_array2d_to_array1d_dummy(self) -> None: - # JTODO: remove a1 = np.arange(10) with self.assertRaises(NotImplementedError): # 1 dimensional _ = array2d_to_array1d(a1) def test_array2d_to_array1d_b(self) -> None: - # def py_array2d_to_array1d(array: np.ndarray) -> np.ndarray: - # post: np.ndarray = np.empty(array.shape[0], dtype=object) - # for i, row in enumerate(array): - # post[i] = tuple(row) - # post.flags.writeable = False - # return post - a1 = np.arange(10).reshape(5, 2) - result = array2d_to_array1d(a1) assert isinstance(result[0], tuple) assert result[0] == (0, 1) - assert tuple(result) == ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9)) - + self.assertIs(type(result[0][0]), np.int64) + self.assertFalse(result.flags.writeable) + self.assertEqual(tuple(result), ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9))) + + + def test_array2d_to_array1d_c(self) -> None: + a1 = np.array([["a", "b"], ["ccc", "ddd"], ["ee", "ff"]]) + a2 = array2d_to_array1d(a1) + self.assertEqual(a2.tolist(), [('a', 'b'), ('ccc', 'ddd'), ('ee', 'ff')]) + + def test_array2d_to_array1d_d(self) -> None: + a1 = np.array([[3, 5], [10, 20], [7, 2]], dtype=np.uint8) + a2 = array2d_to_array1d(a1) + self.assertEqual(a2.tolist(), [(3, 5), (10, 20), (7, 2)]) + self.assertIs(type(a2[0][0]), np.uint8) + #--------------------------------------------------------------------------- From 31eec456564bcdf656fb230b5b0c54af01aad7ae Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Tue, 11 Jun 2024 12:12:29 -0700 Subject: [PATCH 04/28] clean up, unified error handling --- src/_arraykit.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 0781955e..7212883e 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3534,38 +3534,45 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) npy_intp num_rows = PyArray_DIM(input_array, 0); npy_intp num_cols = PyArray_DIM(input_array, 1); - npy_intp dims[1] = {num_rows}; - PyObject* output_array = PyArray_SimpleNew(1, dims, NPY_OBJECT); - if (output_array == NULL) { + npy_intp dims[] = {num_rows}; + // NOTE: this initializes values to NULL, not None + PyObject* output = PyArray_SimpleNew(1, dims, NPY_OBJECT); + if (output == NULL) { return NULL; } - for (npy_intp i = 0; i < num_rows; ++i) { + PyObject** output_data = (PyObject**)PyArray_DATA((PyArrayObject*)output); + PyObject** p = output_data; + PyObject** p_end = p + num_rows; + npy_intp i; + + while (p < p_end) { PyObject* tuple = PyTuple_New(num_cols); if (!tuple) { - Py_DECREF(output_array); - return NULL; + goto error; } - + i = p - output_data; for (npy_intp j = 0; j < num_cols; ++j) { + // cannot assume input_array is contiguous PyObject* item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); - // PyObject* item = PyArray_GETITEM(input_array, PyArray_GETPTR2(input_array, i, j)); if (!item) { Py_DECREF(tuple); - Py_DECREF(output_array); // TODO: need to decrer object compmonents - return NULL; + goto error; } - PyTuple_SET_ITEM(tuple, j, item); + PyTuple_SET_ITEM(tuple, j, item); // steals reference to item } - - PyArray_SETITEM((PyArrayObject*)output_array, PyArray_GETPTR1((PyArrayObject*)output_array, i), tuple); - Py_DECREF(tuple); // JTODO: check + *p++ = tuple; // assign with new ref, no incr needed } - - PyArray_CLEARFLAGS((PyArrayObject *)output_array, NPY_ARRAY_WRITEABLE); - - // Py_INCREF(a); // JTODO: check if this is necessary - return output_array; + PyArray_CLEARFLAGS((PyArrayObject *)output, NPY_ARRAY_WRITEABLE); + return output; +error: + p = output_data; + p_end = p + num_rows; + while (p < p_end) { // decref all tuples within array + Py_XDECREF(*p++); // xdec as might be NULL + } + Py_DECREF(output); + return NULL; } //------------------------------------------------------------------------------ @@ -6001,7 +6008,6 @@ TriMap_dealloc(TriMapObject *self) { if (self->many_from != NULL) { // decref all arrays before freeing for (Py_ssize_t i = 0; i < self->many_count; i++) { - // NOTE: using dot to get to pointer? Py_DECREF((PyObject*)self->many_from[i].dst); } PyMem_Free(self->many_from); From 968251e37ded74bf9ba26b44b6de2014644cd469 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Tue, 11 Jun 2024 12:43:27 -0700 Subject: [PATCH 05/28] added performance panel --- doc/articles/array2d_to_1d.py | 243 ++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 doc/articles/array2d_to_1d.py diff --git a/doc/articles/array2d_to_1d.py b/doc/articles/array2d_to_1d.py new file mode 100644 index 00000000..119c4e95 --- /dev/null +++ b/doc/articles/array2d_to_1d.py @@ -0,0 +1,243 @@ + + + +import os +import sys +import timeit +import typing as tp + +from arraykit import array2d_to_array1d +import arraykit as ak + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +sys.path.append(os.getcwd()) + + + +class ArrayProcessor: + NAME = '' + SORT = -1 + + def __init__(self, array: np.ndarray): + self.array = array + +#------------------------------------------------------------------------------- +class AKArray2D1D(ArrayProcessor): + NAME = 'ak.array2d_to_array1d()' + SORT = 0 + + def __call__(self): + _ = array2d_to_array1d(self.array) + +class PyArray2D1D(ArrayProcessor): + NAME = 'Python construction' + SORT = 1 + + def __call__(self): + post = np.empty(self.array.shape[0], dtype=object) + for i, row in enumerate(self.array): + post[i] = tuple(row) + post.flags.writeable = False + return post + + +#------------------------------------------------------------------------------- +NUMBER = 200 + +def seconds_to_display(seconds: float) -> str: + seconds /= NUMBER + if seconds < 1e-4: + return f'{seconds * 1e6: .1f} (µs)' + if seconds < 1e-1: + return f'{seconds * 1e3: .1f} (ms)' + return f'{seconds: .1f} (s)' + + +def plot_performance(frame): + fixture_total = len(frame['fixture'].unique()) + cat_total = len(frame['size'].unique()) + processor_total = len(frame['cls_processor'].unique()) + fig, axes = plt.subplots(cat_total, fixture_total) + + # cmap = plt.get_cmap('terrain') + cmap = plt.get_cmap('plasma') + + color = cmap(np.arange(processor_total) / max(processor_total, 3)) + + # category is the size of the array + for cat_count, (cat_label, cat) in enumerate(frame.groupby('size')): + # each fixture is a collection of tests for one display + fixtures = {fixture_label: fixture for fixture_label, fixture in cat.groupby('fixture')} + for fixture_count, (fixture_label, fixture) in enumerate( + (k, fixtures[k]) for k in FixtureFactory.DENSITY_TO_DISPLAY): + ax = axes[cat_count][fixture_count] + + # set order + fixture['sort'] = [f.SORT for f in fixture['cls_processor']] + fixture = fixture.sort_values('sort') + + results = fixture['time'].values.tolist() + names = [cls.NAME for cls in fixture['cls_processor']] + # x = np.arange(len(results)) + names_display = names + post = ax.bar(names_display, results, color=color) + + # density, position = fixture_label.split('-') + # cat_label is the size of the array + title = f'{cat_label:.0e}\n{FixtureFactory.DENSITY_TO_DISPLAY[fixture_label]}' + + ax.set_title(title, fontsize=6) + ax.set_box_aspect(0.75) # makes taller than wide + time_max = fixture['time'].max() + ax.set_yticks([0, time_max * 0.5, time_max]) + ax.set_yticklabels(['', + seconds_to_display(time_max * .5), + seconds_to_display(time_max), + ], fontsize=4) + # ax.set_xticks(x, names_display, rotation='vertical') + ax.tick_params( + axis='x', + which='both', + bottom=False, + top=False, + labelbottom=False, + ) + + fig.set_size_inches(8, 4) # width, height + fig.legend(post, names_display, loc='center right', fontsize=6) + # horizontal, vertical + fig.text(.05, .96, f'array2d_to_array1d() Performance: {NUMBER} Iterations', fontsize=10) + fig.text(.05, .90, get_versions(), fontsize=6) + + fp = '/tmp/array2d_to_array1d.png' + plt.subplots_adjust( + left=0.05, + bottom=0.05, + right=0.8, + top=0.85, + wspace=0.9, # width + hspace=0.5, + ) + # plt.rcParams.update({'font.size': 22}) + plt.savefig(fp, dpi=300) + + if sys.platform.startswith('linux'): + os.system(f'eog {fp}&') + else: + os.system(f'open {fp}') + + +#------------------------------------------------------------------------------- + +class FixtureFactory: + NAME = '' + + @staticmethod + def get_array(size: int, width_ratio: int) -> np.ndarray: + return np.arange(size).reshape(width_ratio, size // width_ratio) + + @classmethod + def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: + array = cls.get_array(size) + return cls.NAME, array + + DENSITY_TO_DISPLAY = { + 'column-2': '2 Column', + 'column-5': '5 Column', + 'column-10': '10 Column', + 'column-20': '20 Column', + } + + # POSITION_TO_DISPLAY = { + # 'first_third': 'Fill 1/3 to End', + # 'second_third': 'Fill 2/3 to End', + # } + + +class FFC2(FixtureFactory): + NAME = 'column-2' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 2) + return a + +class FFC5(FixtureFactory): + NAME = 'column-5' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 5) + return a + +class FFC10(FixtureFactory): + NAME = 'column-10' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 10) + return a + +class FFC20(FixtureFactory): + NAME = 'column-20' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 20) + return a + +def get_versions() -> str: + import platform + return f'OS: {platform.system()} / ArrayKit: {ak.__version__} / NumPy: {np.__version__}\n' + + +CLS_PROCESSOR = ( + AKArray2D1D, + PyArray2D1D, + ) + +CLS_FF = ( + FFC2, + FFC5, + FFC10, + FFC20, +) + + +def run_test(): + records = [] + for size in (1_000, 10_000, 100_000, 1_000_000): + for ff in CLS_FF: + fixture_label, fixture = ff.get_label_array(size) + for cls in CLS_PROCESSOR: + runner = cls(fixture) + + record = [cls, NUMBER, fixture_label, size] + print(record) + try: + result = timeit.timeit( + f'runner()', + globals=locals(), + number=NUMBER) + except OSError: + result = np.nan + finally: + pass + record.append(result) + records.append(record) + + f = pd.DataFrame.from_records(records, + columns=('cls_processor', 'number', 'fixture', 'size', 'time') + ) + print(f) + plot_performance(f) + +if __name__ == '__main__': + + run_test() + + + From 93318b866c2ed6aed5be6e01b99d7a30959d23f1 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Tue, 11 Jun 2024 12:48:43 -0700 Subject: [PATCH 06/28] handle winedows int type --- src/_arraykit.c | 2 +- test/test_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 7212883e..839dde5d 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3555,7 +3555,7 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) for (npy_intp j = 0; j < num_cols; ++j) { // cannot assume input_array is contiguous PyObject* item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); - if (!item) { + if (item == NULL) { Py_DECREF(tuple); goto error; } diff --git a/test/test_util.py b/test/test_util.py index a489a4d9..cff7f183 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -292,7 +292,7 @@ def test_array2d_to_array1d_dummy(self) -> None: _ = array2d_to_array1d(a1) def test_array2d_to_array1d_b(self) -> None: - a1 = np.arange(10).reshape(5, 2) + a1 = np.arange(10, dtype=np.int64).reshape(5, 2) result = array2d_to_array1d(a1) assert isinstance(result[0], tuple) assert result[0] == (0, 1) From 0e649ca2fa92653665ddf67f9c9eb9a1a97286c4 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Wed, 12 Jun 2024 09:22:08 -0700 Subject: [PATCH 07/28] minor optimizations --- src/_arraykit.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 839dde5d..53dd8c46 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3524,6 +3524,8 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) } +# define AK_A2D1D_ + // Reshape if necessary a row that might be 2D or 1D is returned as a 1D array. static PyObject * array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) @@ -3544,17 +3546,18 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) PyObject** output_data = (PyObject**)PyArray_DATA((PyArrayObject*)output); PyObject** p = output_data; PyObject** p_end = p + num_rows; - npy_intp i; + npy_intp i = 0; + PyObject* tuple; + PyObject* item; while (p < p_end) { - PyObject* tuple = PyTuple_New(num_cols); - if (!tuple) { + tuple = PyTuple_New(num_cols); + if (tuple == NULL) { goto error; } - i = p - output_data; for (npy_intp j = 0; j < num_cols; ++j) { // cannot assume input_array is contiguous - PyObject* item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); + item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); if (item == NULL) { Py_DECREF(tuple); goto error; @@ -3562,6 +3565,7 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) PyTuple_SET_ITEM(tuple, j, item); // steals reference to item } *p++ = tuple; // assign with new ref, no incr needed + i++; } PyArray_CLEARFLAGS((PyArrayObject *)output, NPY_ARRAY_WRITEABLE); return output; From 05bde4d3a332fe070060527df737788ddd8452d3 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Wed, 12 Jun 2024 11:07:27 -0700 Subject: [PATCH 08/28] corrected performance tests, additional tests --- doc/articles/array2d_to_1d.py | 2 +- src/_arraykit.c | 3 ++- test/test_util.py | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/articles/array2d_to_1d.py b/doc/articles/array2d_to_1d.py index 119c4e95..3da09220 100644 --- a/doc/articles/array2d_to_1d.py +++ b/doc/articles/array2d_to_1d.py @@ -137,7 +137,7 @@ class FixtureFactory: @staticmethod def get_array(size: int, width_ratio: int) -> np.ndarray: - return np.arange(size).reshape(width_ratio, size // width_ratio) + return np.arange(size).reshape(size // width_ratio, width_ratio) @classmethod def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: diff --git a/src/_arraykit.c b/src/_arraykit.c index 53dd8c46..cf4370b4 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3547,6 +3547,7 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) PyObject** p = output_data; PyObject** p_end = p + num_rows; npy_intp i = 0; + npy_intp j; PyObject* tuple; PyObject* item; @@ -3555,7 +3556,7 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) if (tuple == NULL) { goto error; } - for (npy_intp j = 0; j < num_cols; ++j) { + for (j = 0; j < num_cols; ++j) { // cannot assume input_array is contiguous item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); if (item == NULL) { diff --git a/test/test_util.py b/test/test_util.py index cff7f183..18304991 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -312,7 +312,10 @@ def test_array2d_to_array1d_d(self) -> None: self.assertEqual(a2.tolist(), [(3, 5), (10, 20), (7, 2)]) self.assertIs(type(a2[0][0]), np.uint8) - + def test_array2d_to_array1d_e(self) -> None: + a1 = np.arange(20, dtype=np.int64).reshape(4, 5) + result = array2d_to_array1d(a1) + self.assertEqual(result.tolist(), [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) #--------------------------------------------------------------------------- From 05f5dd6a9974167a0426fb7e2a00134d434439d5 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 17:30:09 -0400 Subject: [PATCH 09/28] progress on array2d_tuple_iter --- src/__init__.py | 1 + src/__init__.pyi | 3 +- src/_arraykit.c | 97 +++++++++++++++++++++++++++++++++++++++++++++-- test/test_util.py | 7 ++++ 4 files changed, 103 insertions(+), 5 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index e50f4b2c..0f963278 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -29,4 +29,5 @@ from ._arraykit import first_true_2d as first_true_2d from ._arraykit import slice_to_ascending_slice as slice_to_ascending_slice from ._arraykit import array2d_to_array1d as array2d_to_array1d +from ._arraykit import array2d_tuple_iter as array2d_tuple_iter from ._arraykit import nonzero_1d as nonzero_1d diff --git a/src/__init__.pyi b/src/__init__.pyi index 3f61a10a..71f30ac7 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -161,4 +161,5 @@ def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ... def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ... def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ... def slice_to_ascending_slice(__slice: slice, __size: int) -> slice: ... -def array2d_to_array1d(__array: np.ndarray) -> np.ndarray: ... \ No newline at end of file +def array2d_to_array1d(__array: np.ndarray) -> np.ndarray: ... +def array2d_tuple_iter(__array: np.ndarray) -> tp.Iterator[tp.Tuple[tp.Any, ...]]: ... \ No newline at end of file diff --git a/src/_arraykit.c b/src/_arraykit.c index cf4370b4..7673d659 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3523,10 +3523,8 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) return AK_ArrayDeepCopy(m, (PyArrayObject*)array, memo); } - -# define AK_A2D1D_ - -// Reshape if necessary a row that might be 2D or 1D is returned as a 1D array. +//------------------------------------------------------------------------------ +// Given a 2D array, return a 1D object array of tuples. static PyObject * array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) { @@ -3580,6 +3578,96 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) return NULL; } +//------------------------------------------------------------------------------ +// Array2DTuple Iterator + +static PyTypeObject A2DTupleType; + +typedef struct A2DTupleObject { + PyObject_HEAD + PyArrayObject* array; + npy_intp num_rows; + npy_intp num_cols; + Py_ssize_t pos; // current index state, mutated in-place + +} A2DTupleObject; + +static PyObject * +A2DTuple_new(PyArrayObject* array, + npy_intp num_rows, + npy_intp num_cols) { + A2DTupleObject* a2dt = PyObject_New(A2DTupleObject, &A2DTupleType); + if (!a2dt) { + return NULL; + } + Py_INCREF((PyObject*)array); + a2dt->array = array; + a2dt->num_rows = num_rows; + a2dt->num_cols = num_cols; + a2dt->pos = 0; + return (PyObject *)a2dt; +} + +static void +A2DTuple_dealloc(A2DTupleObject *self) { + Py_DECREF((PyObject*)self->array); + PyObject_Del((PyObject*)self); +} + +static PyObject* +A2DTuple_iter(A2DTupleObject *self) { + Py_INCREF(self); + return (PyObject*)self; +} + +static PyObject * +A2DTuple_iternext(A2DTupleObject *self) { + Py_ssize_t i = self->pos++; + if (i >= self->num_rows) { + return NULL; + } + Py_RETURN_NONE; +} + +// static PyObject * +// A2DTuple_reversed(A2DTupleObject *self) { +// return A2DTuple_new(self->bi, !self->reversed); +// } + +static PyObject * +A2DTuple_length_hint(A2DTupleObject *self) { + Py_ssize_t len = Py_MAX(0, self->num_rows - self->pos); + return PyLong_FromSsize_t(len); +} + +static PyMethodDef A2DTuple_methods[] = { + {"__length_hint__", (PyCFunction)A2DTuple_length_hint, METH_NOARGS, NULL}, + // {"__reversed__", (PyCFunction)A2DTuple_reversed, METH_NOARGS, NULL}, + {NULL}, +}; + +static PyTypeObject A2DTupleType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_basicsize = sizeof(A2DTupleObject), + .tp_dealloc = (destructor) A2DTuple_dealloc, + .tp_iter = (getiterfunc) A2DTuple_iter, + .tp_iternext = (iternextfunc) A2DTuple_iternext, + .tp_methods = A2DTuple_methods, + .tp_name = "arraykit.A2DTupleIterator", +}; + +// Given a 2D array, return an iterator of row tuples. +static PyObject * +array2d_tuple_iter(PyObject *Py_UNUSED(m), PyObject *a) +{ + AK_CHECK_NUMPY_ARRAY_2D(a); + PyArrayObject* array = (PyArrayObject *)a; + npy_intp num_rows = PyArray_DIM(array, 0); + npy_intp num_cols = PyArray_DIM(array, 1); + return A2DTuple_new(array, num_rows, num_cols); +} + + //------------------------------------------------------------------------------ // type resolution @@ -7328,6 +7416,7 @@ static PyMethodDef arraykit_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"array2d_to_array1d", array2d_to_array1d, METH_O, NULL}, + {"array2d_tuple_iter", array2d_tuple_iter, METH_O, NULL}, {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL}, {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, {"first_true_1d", diff --git a/test/test_util.py b/test/test_util.py index 18304991..963ef412 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -23,6 +23,7 @@ from arraykit import first_true_2d from arraykit import slice_to_ascending_slice from arraykit import array2d_to_array1d +from arraykit import array2d_tuple_iter from performance.reference.util import get_new_indexers_and_screen_ak as get_new_indexers_and_screen_full from arraykit import get_new_indexers_and_screen @@ -317,6 +318,12 @@ def test_array2d_to_array1d_e(self) -> None: result = array2d_to_array1d(a1) self.assertEqual(result.tolist(), [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) + #--------------------------------------------------------------------------- + def test_array2d_tuple_iter_a(self) -> None: + a1 = np.arange(20, dtype=np.int64).reshape(4, 5) + result = array2d_tuple_iter(a1) + self.assertEqual(len(list(result)), 4) + #--------------------------------------------------------------------------- def test_isna_element_a(self) -> None: From 8d33bd9cdaf5f973c053bdd068069281e48780e9 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 17:43:22 -0400 Subject: [PATCH 10/28] progress on array2d_tuple_iter --- src/_arraykit.c | 26 +++++++++++++++++++++++--- test/test_util.py | 5 +++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 7673d659..e34cda9f 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3616,17 +3616,37 @@ A2DTuple_dealloc(A2DTupleObject *self) { static PyObject* A2DTuple_iter(A2DTupleObject *self) { + // NOTE: why do we not need to incref components sof self? Py_INCREF(self); return (PyObject*)self; } static PyObject * A2DTuple_iternext(A2DTupleObject *self) { - Py_ssize_t i = self->pos++; - if (i >= self->num_rows) { + Py_ssize_t i = self->pos; + if (i < self->num_rows) { + npy_intp num_cols = self->num_cols; + PyArrayObject* array = self->array; + PyObject* tuple = PyTuple_New(num_cols); + PyObject* item; + if (tuple == NULL) { + return NULL; + } + for (npy_intp j = 0; j < num_cols; ++j) { + // cannot assume input_array is contiguous + item = PyArray_ToScalar(PyArray_GETPTR2(array, i, j), array); + if (item == NULL) { + Py_DECREF(tuple); + return NULL; + } + PyTuple_SET_ITEM(tuple, j, item); // steals reference to item + } + self->pos++; + return tuple; + } + else { return NULL; } - Py_RETURN_NONE; } // static PyObject * diff --git a/test/test_util.py b/test/test_util.py index 963ef412..b61b3fb8 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -321,8 +321,9 @@ def test_array2d_to_array1d_e(self) -> None: #--------------------------------------------------------------------------- def test_array2d_tuple_iter_a(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(4, 5) - result = array2d_tuple_iter(a1) - self.assertEqual(len(list(result)), 4) + result = list(array2d_tuple_iter(a1)) + self.assertEqual(len(result), 4) + self.assertEqual(result, [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) #--------------------------------------------------------------------------- From b3386ca9dcc1ff3b0cd4b11d2c90115ddb886d25 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 17:47:57 -0400 Subject: [PATCH 11/28] additional testing of array2d_tuple_iter --- test/test_util.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_util.py b/test/test_util.py index b61b3fb8..b8202e1e 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -325,6 +325,20 @@ def test_array2d_tuple_iter_a(self) -> None: self.assertEqual(len(result), 4) self.assertEqual(result, [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) + def test_array2d_tuple_iter_b(self) -> None: + a1 = np.arange(20, dtype=np.int64).reshape(10, 2) + result = list(array2d_tuple_iter(a1)) + self.assertEqual(result, [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11), (12, 13), (14, 15), (16, 17), (18, 19)]) + + def test_array2d_tuple_iter_c(self) -> None: + a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) + it = array2d_tuple_iter(a1) + self.assertEqual(next(it), ('aaa', 'bb')) + self.assertEqual(next(it), ('c', 'dd')) + self.assertEqual(next(it), ('ee', 'fffff')) + with self.assertRaises(StopIteration): + next(it) + #--------------------------------------------------------------------------- def test_isna_element_a(self) -> None: From 4334fe619e444a4ed1847a24fc495bd8b750ee08 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 18:26:12 -0400 Subject: [PATCH 12/28] refined performance tests --- doc/articles/array2d_to_1d.py | 7 - doc/articles/array2d_tuple_iter.py | 260 +++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 7 deletions(-) create mode 100644 doc/articles/array2d_tuple_iter.py diff --git a/doc/articles/array2d_to_1d.py b/doc/articles/array2d_to_1d.py index 3da09220..99bd158f 100644 --- a/doc/articles/array2d_to_1d.py +++ b/doc/articles/array2d_to_1d.py @@ -1,6 +1,3 @@ - - - import os import sys import timeit @@ -15,8 +12,6 @@ sys.path.append(os.getcwd()) - - class ArrayProcessor: NAME = '' SORT = -1 @@ -41,8 +36,6 @@ def __call__(self): for i, row in enumerate(self.array): post[i] = tuple(row) post.flags.writeable = False - return post - #------------------------------------------------------------------------------- NUMBER = 200 diff --git a/doc/articles/array2d_tuple_iter.py b/doc/articles/array2d_tuple_iter.py new file mode 100644 index 00000000..5c2fd8d1 --- /dev/null +++ b/doc/articles/array2d_tuple_iter.py @@ -0,0 +1,260 @@ +import os +import sys +import timeit +import typing as tp + +from arraykit import array2d_tuple_iter +import arraykit as ak + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +sys.path.append(os.getcwd()) + +class ArrayProcessor: + NAME = '' + SORT = -1 + + def __init__(self, array: np.ndarray): + self.array = array + +#------------------------------------------------------------------------------- +class AKArray2DTupleList(ArrayProcessor): + NAME = 'list(ak.array2d_tuple_iter(a2d))' + SORT = 0 + + def __call__(self): + _ = list(array2d_tuple_iter(self.array)) + +class AKArray2DTupleNext(ArrayProcessor): + NAME = 'next(ak.array2d_tuple_iter(a2d))' + SORT = 1 + + def __call__(self): + it = array2d_tuple_iter(self.array) + while True: + try: + _ = next(it) + except StopIteration: + break + +class PyArray2DTupleMapList(ArrayProcessor): + NAME = 'list(map(tuple, a2d))' + SORT = 2 + + def __call__(self): + array = self.array + _ = list(map(tuple, array)) + +class PyArray2DTupleIterNext(ArrayProcessor): + NAME = 'tuple(next(iter(a2d)))' + SORT = 3 + + def __call__(self): + it = iter(self.array) + while True: + try: + _ = tuple(next(it)) + except StopIteration: + break + + + + + +#------------------------------------------------------------------------------- +NUMBER = 200 + +def seconds_to_display(seconds: float) -> str: + seconds /= NUMBER + if seconds < 1e-4: + return f'{seconds * 1e6: .1f} (µs)' + if seconds < 1e-1: + return f'{seconds * 1e3: .1f} (ms)' + return f'{seconds: .1f} (s)' + + +def plot_performance(frame): + fixture_total = len(frame['fixture'].unique()) + cat_total = len(frame['size'].unique()) + processor_total = len(frame['cls_processor'].unique()) + fig, axes = plt.subplots(cat_total, fixture_total) + + # cmap = plt.get_cmap('terrain') + cmap = plt.get_cmap('plasma') + + color = cmap(np.arange(processor_total) / max(processor_total, 3)) + + # category is the size of the array + for cat_count, (cat_label, cat) in enumerate(frame.groupby('size')): + # each fixture is a collection of tests for one display + fixtures = {fixture_label: fixture for fixture_label, fixture in cat.groupby('fixture')} + for fixture_count, (fixture_label, fixture) in enumerate( + (k, fixtures[k]) for k in FixtureFactory.DENSITY_TO_DISPLAY): + ax = axes[cat_count][fixture_count] + + # set order + fixture['sort'] = [f.SORT for f in fixture['cls_processor']] + fixture = fixture.sort_values('sort') + + results = fixture['time'].values.tolist() + names = [cls.NAME for cls in fixture['cls_processor']] + # x = np.arange(len(results)) + names_display = names + post = ax.bar(names_display, results, color=color) + + # density, position = fixture_label.split('-') + # cat_label is the size of the array + title = f'{cat_label:.0e}\n{FixtureFactory.DENSITY_TO_DISPLAY[fixture_label]}' + + ax.set_title(title, fontsize=6) + ax.set_box_aspect(0.75) # makes taller than wide + time_max = fixture['time'].max() + ax.set_yticks([0, time_max * 0.5, time_max]) + ax.set_yticklabels(['', + seconds_to_display(time_max * .5), + seconds_to_display(time_max), + ], fontsize=4) + # ax.set_xticks(x, names_display, rotation='vertical') + ax.tick_params( + axis='x', + which='both', + bottom=False, + top=False, + labelbottom=False, + ) + + fig.set_size_inches(8, 4) # width, height + fig.legend(post, names_display, loc='center right', fontsize=6) + # horizontal, vertical + fig.text(.05, .96, f'array2d_tuple_iter() Performance: {NUMBER} Iterations', fontsize=10) + fig.text(.05, .90, get_versions(), fontsize=6) + + fp = '/tmp/array2d_tuple_iter.png' + plt.subplots_adjust( + left=0.05, + bottom=0.05, + right=0.8, + top=0.85, + wspace=0.1, # width + hspace=0.5, + ) + # plt.rcParams.update({'font.size': 22}) + plt.savefig(fp, dpi=300) + + if sys.platform.startswith('linux'): + os.system(f'eog {fp}&') + else: + os.system(f'open {fp}') + + +#------------------------------------------------------------------------------- + +class FixtureFactory: + NAME = '' + + @staticmethod + def get_array(size: int, width_ratio: int) -> np.ndarray: + return np.arange(size).reshape(size // width_ratio, width_ratio) + + @classmethod + def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: + array = cls.get_array(size) + return cls.NAME, array + + DENSITY_TO_DISPLAY = { + 'column-2': '2 Column', + 'column-5': '5 Column', + 'column-10': '10 Column', + 'column-20': '20 Column', + } + + +class FFC2(FixtureFactory): + NAME = 'column-2' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 2) + return a + +class FFC5(FixtureFactory): + NAME = 'column-5' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 5) + return a + +class FFC10(FixtureFactory): + NAME = 'column-10' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 10) + return a + +class FFC20(FixtureFactory): + NAME = 'column-20' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 20) + return a + +def get_versions() -> str: + import platform + return f'OS: {platform.system()} / ArrayKit: {ak.__version__} / NumPy: {np.__version__}\n' + + +CLS_PROCESSOR = ( + AKArray2DTupleList, + AKArray2DTupleNext, + PyArray2DTupleMapList, + PyArray2DTupleIterNext, + ) + + +CLS_FF = ( + FFC2, + FFC5, + FFC10, + FFC20, +) + + +def run_test(): + records = [] + for size in (1_000, 10_000, 100_000, 1_000_000): + for ff in CLS_FF: + fixture_label, fixture = ff.get_label_array(size) + for cls in CLS_PROCESSOR: + runner = cls(fixture) + + record = [cls, NUMBER, fixture_label, size] + print(record) + try: + result = timeit.timeit( + f'runner()', + globals=locals(), + number=NUMBER) + except OSError: + result = np.nan + finally: + pass + record.append(result) + records.append(record) + + f = pd.DataFrame.from_records(records, + columns=('cls_processor', 'number', 'fixture', 'size', 'time') + ) + print(f) + plot_performance(f) + +if __name__ == '__main__': + + run_test() + + + From 4e810c6d82049a200c88d3cedc6a835f752574d5 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 18:30:53 -0400 Subject: [PATCH 13/28] additional test of lenght hint --- src/_arraykit.c | 4 +--- test/test_util.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index e34cda9f..c5293cbb 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3644,9 +3644,7 @@ A2DTuple_iternext(A2DTupleObject *self) { self->pos++; return tuple; } - else { - return NULL; - } + return NULL; } // static PyObject * diff --git a/test/test_util.py b/test/test_util.py index b8202e1e..1e79a13f 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -333,12 +333,23 @@ def test_array2d_tuple_iter_b(self) -> None: def test_array2d_tuple_iter_c(self) -> None: a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) it = array2d_tuple_iter(a1) + self.assertEqual(it.__length_hint__(), 3) self.assertEqual(next(it), ('aaa', 'bb')) + self.assertEqual(it.__length_hint__(), 2) self.assertEqual(next(it), ('c', 'dd')) + self.assertEqual(it.__length_hint__(), 1) self.assertEqual(next(it), ('ee', 'fffff')) + self.assertEqual(it.__length_hint__(), 0) with self.assertRaises(StopIteration): next(it) + def test_array2d_tuple_iter_d(self) -> None: + a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) + it = array2d_tuple_iter(a1) + # __reversed__ not implemented + with self.assertRaises(TypeError): + reversed(it) + #--------------------------------------------------------------------------- def test_isna_element_a(self) -> None: From f355c903a8a56e9a32484dc778df5fd3397c487c Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 18:39:24 -0400 Subject: [PATCH 14/28] additional tests of array2d_tuple_iter --- src/_arraykit.c | 2 +- test/test_util.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index c5293cbb..d2bc9933 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3633,7 +3633,7 @@ A2DTuple_iternext(A2DTupleObject *self) { return NULL; } for (npy_intp j = 0; j < num_cols; ++j) { - // cannot assume input_array is contiguous + // cannot assume array is contiguous item = PyArray_ToScalar(PyArray_GETPTR2(array, i, j), array); if (item == NULL) { Py_DECREF(tuple); diff --git a/test/test_util.py b/test/test_util.py index 1e79a13f..2c3c0d29 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -350,6 +350,28 @@ def test_array2d_tuple_iter_d(self) -> None: with self.assertRaises(TypeError): reversed(it) + def test_array2d_tuple_iter_e(self) -> None: + a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) + it = array2d_tuple_iter(a1) + del a1 + self.assertEqual(list(it), [(None, 'bb'), (None, 'dd'), (3, None)]) + + def test_array2d_tuple_iter_f(self) -> None: + a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) + it1 = array2d_tuple_iter(a1) + del a1 + it2 = iter(it1) + self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) + self.assertEqual(list(it2), []) # expected behavior + + def test_array2d_tuple_iter_g(self) -> None: + a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) + it1 = array2d_tuple_iter(a1) + it2 = array2d_tuple_iter(a1) + del a1 + self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) + self.assertEqual(list(it2), [(None, 'bb'), (None, 'dd'), (3, None)]) + #--------------------------------------------------------------------------- def test_isna_element_a(self) -> None: From 74b478d105cba543a99faf78202012dddf312b52 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Thu, 20 Jun 2024 18:42:08 -0400 Subject: [PATCH 15/28] code cleanup --- src/_arraykit.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index d2bc9933..d6ab0501 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3589,7 +3589,6 @@ typedef struct A2DTupleObject { npy_intp num_rows; npy_intp num_cols; Py_ssize_t pos; // current index state, mutated in-place - } A2DTupleObject; static PyObject * @@ -3616,7 +3615,6 @@ A2DTuple_dealloc(A2DTupleObject *self) { static PyObject* A2DTuple_iter(A2DTupleObject *self) { - // NOTE: why do we not need to incref components sof self? Py_INCREF(self); return (PyObject*)self; } From 8e7937fbfd118640a68320d7dc5fe9008dbf0131 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 21 Jun 2024 11:20:05 -0700 Subject: [PATCH 16/28] 0.7.0 RC 1 --- README.rst | 8 ++++++++ setup.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 3be5d899..2f3bf1b2 100644 --- a/README.rst +++ b/README.rst @@ -37,6 +37,14 @@ ArrayKit requires the following: What is New in ArrayKit ------------------------- +0.7.0 +............ + +Added ``array2d_to_array1d()``. + +Added ``array2d_tuple_iter()``. + + 0.6.3 ............ diff --git a/setup.py b/setup.py index 253ae931..e8b881fa 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup from pathlib import Path -AK_VERSION = '0.6.3' +AK_VERSION = '0.7.0' def get_long_description() -> str: return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions. From 0cd2dee0e99f11f9beabc4f6b9d384236b67c443 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 07:30:43 -0700 Subject: [PATCH 17/28] array2d_to_array1d now supports 1d arrays --- src/_arraykit.c | 52 ++++++++++++++++++++++++++++++++++------------- test/test_util.py | 28 +++++++++++++++++++++---- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index d6ab0501..cff487b9 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3528,12 +3528,16 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) static PyObject * array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) { - AK_CHECK_NUMPY_ARRAY_2D(a); + AK_CHECK_NUMPY_ARRAY(a); PyArrayObject *input_array = (PyArrayObject *)a; + int ndim = PyArray_NDIM(input_array); + if (ndim != 1 && ndim != 2) { + return PyErr_Format(PyExc_NotImplementedError, + "Expected 1D or 2D array, not %i.", + ndim); + } npy_intp num_rows = PyArray_DIM(input_array, 0); - npy_intp num_cols = PyArray_DIM(input_array, 1); - npy_intp dims[] = {num_rows}; // NOTE: this initializes values to NULL, not None PyObject* output = PyArray_SimpleNew(1, dims, NPY_OBJECT); @@ -3545,26 +3549,46 @@ array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) PyObject** p = output_data; PyObject** p_end = p + num_rows; npy_intp i = 0; - npy_intp j; PyObject* tuple; PyObject* item; - while (p < p_end) { - tuple = PyTuple_New(num_cols); - if (tuple == NULL) { - goto error; + if (ndim == 2) { + npy_intp num_cols = PyArray_DIM(input_array, 1); + npy_intp j; + while (p < p_end) { + tuple = PyTuple_New(num_cols); + if (tuple == NULL) { + goto error; + } + for (j = 0; j < num_cols; ++j) { + // cannot assume input_array is contiguous + item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); + if (item == NULL) { + Py_DECREF(tuple); + goto error; + } + PyTuple_SET_ITEM(tuple, j, item); // steals reference to item + } + *p++ = tuple; // assign with new ref, no incr needed + i++; } - for (j = 0; j < num_cols; ++j) { - // cannot assume input_array is contiguous - item = PyArray_ToScalar(PyArray_GETPTR2(input_array, i, j), input_array); + } + else { // ndim == 1 + while (p < p_end) { + tuple = PyTuple_New(1); + if (tuple == NULL) { + goto error; + } + // scalar returned in is native PyObject from object arrays + item = PyArray_ToScalar(PyArray_GETPTR1(input_array, i), input_array); if (item == NULL) { Py_DECREF(tuple); goto error; } - PyTuple_SET_ITEM(tuple, j, item); // steals reference to item + PyTuple_SET_ITEM(tuple, 0, item); // steals reference to item + *p++ = tuple; // assign with new ref, no incr needed + i++; } - *p++ = tuple; // assign with new ref, no incr needed - i++; } PyArray_CLEARFLAGS((PyArrayObject *)output, NPY_ARRAY_WRITEABLE); return output; diff --git a/test/test_util.py b/test/test_util.py index 2c3c0d29..f008c983 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -286,11 +286,31 @@ def test_array_deepcopy_h(self) -> None: a2 = array_deepcopy(a1, ()) #--------------------------------------------------------------------------- - def test_array2d_to_array1d_dummy(self) -> None: + def test_array2d_to_array1d_1d_a(self) -> None: a1 = np.arange(10) - with self.assertRaises(NotImplementedError): - # 1 dimensional - _ = array2d_to_array1d(a1) + a2 = array2d_to_array1d(a1) + self.assertEqual(a2.tolist(), [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)]) + + def test_array2d_to_array1d_1d_b(self) -> None: + a1 = np.array(['aaa', 'b', 'ccc']) + a2 = array2d_to_array1d(a1) + self.assertEqual(a2.tolist(), [('aaa',), ('b',), ('ccc',)]) + + def test_array2d_to_array1d_1d_c(self) -> None: + a1 = np.array([None, 'b', 30]) + a2 = array2d_to_array1d(a1) + self.assertEqual(a2.tolist(), [(None,), ('b',), (30,)]) + + def test_array2d_to_array1d_1d_d(self) -> None: + a1 = np.array([('a', 10), ('b', 30), ('c', 5)], dtype=object) + a2 = array2d_to_array1d(a1) + self.assertEqual(a2.tolist(), [('a', 10), ('b', 30), ('c', 5)]) + + def test_array2d_to_array1d_1d_e(self) -> None: + a1 = np.array([True, False, True], dtype=object) + a2 = array2d_to_array1d(a1) + self.assertIs(a2[0][0].__class__, bool) + self.assertEqual(a2.tolist(), [(True,), (False,), (True,)]) def test_array2d_to_array1d_b(self) -> None: a1 = np.arange(10, dtype=np.int64).reshape(5, 2) From 6ebe86e8a2432c2af86cf5dd863b6dc6093cf613 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 08:00:44 -0700 Subject: [PATCH 18/28] refactored array2d_tuple_iter to handle 1D array --- src/_arraykit.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index cff487b9..67991262 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3626,7 +3626,7 @@ A2DTuple_new(PyArrayObject* array, Py_INCREF((PyObject*)array); a2dt->array = array; a2dt->num_rows = num_rows; - a2dt->num_cols = num_cols; + a2dt->num_cols = num_cols; // -1 for 1D array a2dt->pos = 0; return (PyObject *)a2dt; } @@ -3649,19 +3649,35 @@ A2DTuple_iternext(A2DTupleObject *self) { if (i < self->num_rows) { npy_intp num_cols = self->num_cols; PyArrayObject* array = self->array; - PyObject* tuple = PyTuple_New(num_cols); PyObject* item; - if (tuple == NULL) { - return NULL; + PyObject* tuple; + + if (num_cols > -1) { // ndim == 2 + tuple = PyTuple_New(num_cols); + if (tuple == NULL) { + return NULL; + } + for (npy_intp j = 0; j < num_cols; ++j) { + // cannot assume array is contiguous + item = PyArray_ToScalar(PyArray_GETPTR2(array, i, j), array); + if (item == NULL) { + Py_DECREF(tuple); + return NULL; + } + PyTuple_SET_ITEM(tuple, j, item); // steals reference to item + } } - for (npy_intp j = 0; j < num_cols; ++j) { - // cannot assume array is contiguous - item = PyArray_ToScalar(PyArray_GETPTR2(array, i, j), array); + else { // ndim == 1 + tuple = PyTuple_New(1); + if (tuple == NULL) { + return NULL; + } + item = PyArray_ToScalar(PyArray_GETPTR1(array, i), array); if (item == NULL) { Py_DECREF(tuple); return NULL; } - PyTuple_SET_ITEM(tuple, j, item); // steals reference to item + PyTuple_SET_ITEM(tuple, 0, item); // steals reference to item } self->pos++; return tuple; @@ -3700,10 +3716,19 @@ static PyTypeObject A2DTupleType = { static PyObject * array2d_tuple_iter(PyObject *Py_UNUSED(m), PyObject *a) { - AK_CHECK_NUMPY_ARRAY_2D(a); - PyArrayObject* array = (PyArrayObject *)a; + AK_CHECK_NUMPY_ARRAY(a); + PyArrayObject *array = (PyArrayObject *)a; + int ndim = PyArray_NDIM(array); + if (ndim != 1 && ndim != 2) { + return PyErr_Format(PyExc_NotImplementedError, + "Expected 1D or 2D array, not %i.", + ndim); + } npy_intp num_rows = PyArray_DIM(array, 0); - npy_intp num_cols = PyArray_DIM(array, 1); + npy_intp num_cols = -1; // indicate 1d + if (ndim == 2) { + num_cols = PyArray_DIM(array, 1); + } return A2DTuple_new(array, num_rows, num_cols); } From efbad1f3b96792850be2d3c4281f64cb5af9b2a2 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 08:10:10 -0700 Subject: [PATCH 19/28] refactored array2d_tuple_iter to handle 1d array --- test/test_util.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/test_util.py b/test/test_util.py index f008c983..057d6a68 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -392,6 +392,24 @@ def test_array2d_tuple_iter_g(self) -> None: self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) self.assertEqual(list(it2), [(None, 'bb'), (None, 'dd'), (3, None)]) + def test_array2d_tuple_iter_1d_a(self) -> None: + a1 = np.array(['bb', 'c', 'aaa']) + result = list(array2d_tuple_iter(a1)) + self.assertEqual(len(result), 3) + self.assertEqual(result, [('bb',), ('c',), ('aaa',)]) + + def test_array2d_tuple_iter_1d_b(self) -> None: + a1 = np.array([20, -1, 8]) + result = list(array2d_tuple_iter(a1)) + self.assertEqual(len(result), 3) + self.assertEqual(result, [(20,), (-1,), (8,)]) + + def test_array2d_tuple_iter_1d_c(self) -> None: + a1 = np.array([('a', 4), ('c', -1), ('d', 8)], dtype=object) + result = list(array2d_tuple_iter(a1)) + self.assertEqual(len(result), 3) + self.assertEqual(result, [('a', 4), ('c', -1), ('d', 8)]) + #--------------------------------------------------------------------------- def test_isna_element_a(self) -> None: From 83ecf712ef8a5ec2a0cf87de6ae9fdc025ba81f8 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 08:35:26 -0700 Subject: [PATCH 20/28] renamed functions --- README.rst | 4 +-- doc/articles/array2d_to_1d.py | 10 +++---- doc/articles/array2d_tuple_iter.py | 14 +++++----- src/__init__.py | 4 +-- src/__init__.pyi | 4 +-- src/_arraykit.c | 10 +++---- test/test_util.py | 44 +++++++++++++++--------------- 7 files changed, 45 insertions(+), 45 deletions(-) diff --git a/README.rst b/README.rst index 2f3bf1b2..68f10907 100644 --- a/README.rst +++ b/README.rst @@ -40,9 +40,9 @@ What is New in ArrayKit 0.7.0 ............ -Added ``array2d_to_array1d()``. +Added ``array_to_tuple_array()``. -Added ``array2d_tuple_iter()``. +Added ``array_to_tuple_iter()``. 0.6.3 diff --git a/doc/articles/array2d_to_1d.py b/doc/articles/array2d_to_1d.py index 99bd158f..c8a6b8ee 100644 --- a/doc/articles/array2d_to_1d.py +++ b/doc/articles/array2d_to_1d.py @@ -3,7 +3,7 @@ import timeit import typing as tp -from arraykit import array2d_to_array1d +from arraykit import array_to_tuple_array import arraykit as ak import matplotlib.pyplot as plt @@ -21,11 +21,11 @@ def __init__(self, array: np.ndarray): #------------------------------------------------------------------------------- class AKArray2D1D(ArrayProcessor): - NAME = 'ak.array2d_to_array1d()' + NAME = 'ak.array_to_tuple_array()' SORT = 0 def __call__(self): - _ = array2d_to_array1d(self.array) + _ = array_to_tuple_array(self.array) class PyArray2D1D(ArrayProcessor): NAME = 'Python construction' @@ -102,10 +102,10 @@ def plot_performance(frame): fig.set_size_inches(8, 4) # width, height fig.legend(post, names_display, loc='center right', fontsize=6) # horizontal, vertical - fig.text(.05, .96, f'array2d_to_array1d() Performance: {NUMBER} Iterations', fontsize=10) + fig.text(.05, .96, f'array_to_tuple_array() Performance: {NUMBER} Iterations', fontsize=10) fig.text(.05, .90, get_versions(), fontsize=6) - fp = '/tmp/array2d_to_array1d.png' + fp = '/tmp/array_to_tuple_array.png' plt.subplots_adjust( left=0.05, bottom=0.05, diff --git a/doc/articles/array2d_tuple_iter.py b/doc/articles/array2d_tuple_iter.py index 5c2fd8d1..5ef49fb6 100644 --- a/doc/articles/array2d_tuple_iter.py +++ b/doc/articles/array2d_tuple_iter.py @@ -3,7 +3,7 @@ import timeit import typing as tp -from arraykit import array2d_tuple_iter +from arraykit import array_to_tuple_iter import arraykit as ak import matplotlib.pyplot as plt @@ -21,18 +21,18 @@ def __init__(self, array: np.ndarray): #------------------------------------------------------------------------------- class AKArray2DTupleList(ArrayProcessor): - NAME = 'list(ak.array2d_tuple_iter(a2d))' + NAME = 'list(ak.array_to_tuple_iter(a2d))' SORT = 0 def __call__(self): - _ = list(array2d_tuple_iter(self.array)) + _ = list(array_to_tuple_iter(self.array)) class AKArray2DTupleNext(ArrayProcessor): - NAME = 'next(ak.array2d_tuple_iter(a2d))' + NAME = 'next(ak.array_to_tuple_iter(a2d))' SORT = 1 def __call__(self): - it = array2d_tuple_iter(self.array) + it = array_to_tuple_iter(self.array) while True: try: _ = next(it) @@ -128,10 +128,10 @@ def plot_performance(frame): fig.set_size_inches(8, 4) # width, height fig.legend(post, names_display, loc='center right', fontsize=6) # horizontal, vertical - fig.text(.05, .96, f'array2d_tuple_iter() Performance: {NUMBER} Iterations', fontsize=10) + fig.text(.05, .96, f'array_to_tuple_iter() Performance: {NUMBER} Iterations', fontsize=10) fig.text(.05, .90, get_versions(), fontsize=6) - fp = '/tmp/array2d_tuple_iter.png' + fp = '/tmp/array_to_tuple_iter.png' plt.subplots_adjust( left=0.05, bottom=0.05, diff --git a/src/__init__.py b/src/__init__.py index 0f963278..22a7e8dc 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -28,6 +28,6 @@ from ._arraykit import first_true_1d as first_true_1d from ._arraykit import first_true_2d as first_true_2d from ._arraykit import slice_to_ascending_slice as slice_to_ascending_slice -from ._arraykit import array2d_to_array1d as array2d_to_array1d -from ._arraykit import array2d_tuple_iter as array2d_tuple_iter +from ._arraykit import array_to_tuple_array as array_to_tuple_array +from ._arraykit import array_to_tuple_iter as array_to_tuple_iter from ._arraykit import nonzero_1d as nonzero_1d diff --git a/src/__init__.pyi b/src/__init__.pyi index 71f30ac7..1e9f3e3a 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -161,5 +161,5 @@ def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ... def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ... def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ... def slice_to_ascending_slice(__slice: slice, __size: int) -> slice: ... -def array2d_to_array1d(__array: np.ndarray) -> np.ndarray: ... -def array2d_tuple_iter(__array: np.ndarray) -> tp.Iterator[tp.Tuple[tp.Any, ...]]: ... \ No newline at end of file +def array_to_tuple_array(__array: np.ndarray) -> np.ndarray: ... +def array_to_tuple_iter(__array: np.ndarray) -> tp.Iterator[tp.Tuple[tp.Any, ...]]: ... \ No newline at end of file diff --git a/src/_arraykit.c b/src/_arraykit.c index 67991262..8201d48a 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3524,9 +3524,9 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) } //------------------------------------------------------------------------------ -// Given a 2D array, return a 1D object array of tuples. +// Given a 1D or 2D array, return a 1D object array of tuples. static PyObject * -array2d_to_array1d(PyObject *Py_UNUSED(m), PyObject *a) +array_to_tuple_array(PyObject *Py_UNUSED(m), PyObject *a) { AK_CHECK_NUMPY_ARRAY(a); PyArrayObject *input_array = (PyArrayObject *)a; @@ -3714,7 +3714,7 @@ static PyTypeObject A2DTupleType = { // Given a 2D array, return an iterator of row tuples. static PyObject * -array2d_tuple_iter(PyObject *Py_UNUSED(m), PyObject *a) +array_to_tuple_iter(PyObject *Py_UNUSED(m), PyObject *a) { AK_CHECK_NUMPY_ARRAY(a); PyArrayObject *array = (PyArrayObject *)a; @@ -7480,8 +7480,8 @@ static PyMethodDef arraykit_methods[] = { (PyCFunction)array_deepcopy, METH_VARARGS | METH_KEYWORDS, NULL}, - {"array2d_to_array1d", array2d_to_array1d, METH_O, NULL}, - {"array2d_tuple_iter", array2d_tuple_iter, METH_O, NULL}, + {"array_to_tuple_array", array_to_tuple_array, METH_O, NULL}, + {"array_to_tuple_iter", array_to_tuple_iter, METH_O, NULL}, {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL}, {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, {"first_true_1d", diff --git a/test/test_util.py b/test/test_util.py index 057d6a68..0b651430 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -22,8 +22,8 @@ from arraykit import first_true_1d from arraykit import first_true_2d from arraykit import slice_to_ascending_slice -from arraykit import array2d_to_array1d -from arraykit import array2d_tuple_iter +from arraykit import array_to_tuple_array +from arraykit import array_to_tuple_iter from performance.reference.util import get_new_indexers_and_screen_ak as get_new_indexers_and_screen_full from arraykit import get_new_indexers_and_screen @@ -288,33 +288,33 @@ def test_array_deepcopy_h(self) -> None: #--------------------------------------------------------------------------- def test_array2d_to_array1d_1d_a(self) -> None: a1 = np.arange(10) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)]) def test_array2d_to_array1d_1d_b(self) -> None: a1 = np.array(['aaa', 'b', 'ccc']) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [('aaa',), ('b',), ('ccc',)]) def test_array2d_to_array1d_1d_c(self) -> None: a1 = np.array([None, 'b', 30]) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [(None,), ('b',), (30,)]) def test_array2d_to_array1d_1d_d(self) -> None: a1 = np.array([('a', 10), ('b', 30), ('c', 5)], dtype=object) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [('a', 10), ('b', 30), ('c', 5)]) def test_array2d_to_array1d_1d_e(self) -> None: a1 = np.array([True, False, True], dtype=object) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertIs(a2[0][0].__class__, bool) self.assertEqual(a2.tolist(), [(True,), (False,), (True,)]) def test_array2d_to_array1d_b(self) -> None: a1 = np.arange(10, dtype=np.int64).reshape(5, 2) - result = array2d_to_array1d(a1) + result = array_to_tuple_array(a1) assert isinstance(result[0], tuple) assert result[0] == (0, 1) self.assertIs(type(result[0][0]), np.int64) @@ -324,35 +324,35 @@ def test_array2d_to_array1d_b(self) -> None: def test_array2d_to_array1d_c(self) -> None: a1 = np.array([["a", "b"], ["ccc", "ddd"], ["ee", "ff"]]) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [('a', 'b'), ('ccc', 'ddd'), ('ee', 'ff')]) def test_array2d_to_array1d_d(self) -> None: a1 = np.array([[3, 5], [10, 20], [7, 2]], dtype=np.uint8) - a2 = array2d_to_array1d(a1) + a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [(3, 5), (10, 20), (7, 2)]) self.assertIs(type(a2[0][0]), np.uint8) def test_array2d_to_array1d_e(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(4, 5) - result = array2d_to_array1d(a1) + result = array_to_tuple_array(a1) self.assertEqual(result.tolist(), [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) #--------------------------------------------------------------------------- def test_array2d_tuple_iter_a(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(4, 5) - result = list(array2d_tuple_iter(a1)) + result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 4) self.assertEqual(result, [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) def test_array2d_tuple_iter_b(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(10, 2) - result = list(array2d_tuple_iter(a1)) + result = list(array_to_tuple_iter(a1)) self.assertEqual(result, [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11), (12, 13), (14, 15), (16, 17), (18, 19)]) def test_array2d_tuple_iter_c(self) -> None: a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) - it = array2d_tuple_iter(a1) + it = array_to_tuple_iter(a1) self.assertEqual(it.__length_hint__(), 3) self.assertEqual(next(it), ('aaa', 'bb')) self.assertEqual(it.__length_hint__(), 2) @@ -365,20 +365,20 @@ def test_array2d_tuple_iter_c(self) -> None: def test_array2d_tuple_iter_d(self) -> None: a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) - it = array2d_tuple_iter(a1) + it = array_to_tuple_iter(a1) # __reversed__ not implemented with self.assertRaises(TypeError): reversed(it) def test_array2d_tuple_iter_e(self) -> None: a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) - it = array2d_tuple_iter(a1) + it = array_to_tuple_iter(a1) del a1 self.assertEqual(list(it), [(None, 'bb'), (None, 'dd'), (3, None)]) def test_array2d_tuple_iter_f(self) -> None: a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) - it1 = array2d_tuple_iter(a1) + it1 = array_to_tuple_iter(a1) del a1 it2 = iter(it1) self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) @@ -386,27 +386,27 @@ def test_array2d_tuple_iter_f(self) -> None: def test_array2d_tuple_iter_g(self) -> None: a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) - it1 = array2d_tuple_iter(a1) - it2 = array2d_tuple_iter(a1) + it1 = array_to_tuple_iter(a1) + it2 = array_to_tuple_iter(a1) del a1 self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) self.assertEqual(list(it2), [(None, 'bb'), (None, 'dd'), (3, None)]) def test_array2d_tuple_iter_1d_a(self) -> None: a1 = np.array(['bb', 'c', 'aaa']) - result = list(array2d_tuple_iter(a1)) + result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 3) self.assertEqual(result, [('bb',), ('c',), ('aaa',)]) def test_array2d_tuple_iter_1d_b(self) -> None: a1 = np.array([20, -1, 8]) - result = list(array2d_tuple_iter(a1)) + result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 3) self.assertEqual(result, [(20,), (-1,), (8,)]) def test_array2d_tuple_iter_1d_c(self) -> None: a1 = np.array([('a', 4), ('c', -1), ('d', 8)], dtype=object) - result = list(array2d_tuple_iter(a1)) + result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 3) self.assertEqual(result, [('a', 4), ('c', -1), ('d', 8)]) From 09ea4d803a53726584e15de9cf02dfa2b390f13d Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 10:26:07 -0700 Subject: [PATCH 21/28] updated performance tests --- ...ray2d_to_1d.py => array_to_tuple_array.py} | 27 ++++++++--- ...d_tuple_iter.py => array_to_tuple_iter.py} | 45 ++++++++++++++----- 2 files changed, 57 insertions(+), 15 deletions(-) rename doc/articles/{array2d_to_1d.py => array_to_tuple_array.py} (91%) rename doc/articles/{array2d_tuple_iter.py => array_to_tuple_iter.py} (87%) diff --git a/doc/articles/array2d_to_1d.py b/doc/articles/array_to_tuple_array.py similarity index 91% rename from doc/articles/array2d_to_1d.py rename to doc/articles/array_to_tuple_array.py index c8a6b8ee..fa2efc2d 100644 --- a/doc/articles/array2d_to_1d.py +++ b/doc/articles/array_to_tuple_array.py @@ -33,12 +33,16 @@ class PyArray2D1D(ArrayProcessor): def __call__(self): post = np.empty(self.array.shape[0], dtype=object) - for i, row in enumerate(self.array): - post[i] = tuple(row) + if self.array.ndim == 1: + for i, e in enumerate(self.array): + post[i] = (e,) + else: + for i, row in enumerate(self.array): + post[i] = tuple(row) post.flags.writeable = False #------------------------------------------------------------------------------- -NUMBER = 200 +NUMBER = 1 def seconds_to_display(seconds: float) -> str: seconds /= NUMBER @@ -111,7 +115,7 @@ def plot_performance(frame): bottom=0.05, right=0.8, top=0.85, - wspace=0.9, # width + wspace=1.0, # width hspace=0.5, ) # plt.rcParams.update({'font.size': 22}) @@ -130,7 +134,9 @@ class FixtureFactory: @staticmethod def get_array(size: int, width_ratio: int) -> np.ndarray: - return np.arange(size).reshape(size // width_ratio, width_ratio) + if width_ratio > 1: + return np.arange(size).reshape(size // width_ratio, width_ratio) + return np.arange(size) # return 1D array @classmethod def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: @@ -138,6 +144,7 @@ def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: return cls.NAME, array DENSITY_TO_DISPLAY = { + 'column-1': '1 Column', 'column-2': '2 Column', 'column-5': '5 Column', 'column-10': '10 Column', @@ -150,6 +157,15 @@ def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: # } +class FFC1(FixtureFactory): + NAME = 'column-1' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 1) + return a + + class FFC2(FixtureFactory): NAME = 'column-2' @@ -193,6 +209,7 @@ def get_versions() -> str: ) CLS_FF = ( + FFC1, FFC2, FFC5, FFC10, diff --git a/doc/articles/array2d_tuple_iter.py b/doc/articles/array_to_tuple_iter.py similarity index 87% rename from doc/articles/array2d_tuple_iter.py rename to doc/articles/array_to_tuple_iter.py index 5ef49fb6..0bfb6a9a 100644 --- a/doc/articles/array2d_tuple_iter.py +++ b/doc/articles/array_to_tuple_iter.py @@ -45,26 +45,38 @@ class PyArray2DTupleMapList(ArrayProcessor): def __call__(self): array = self.array - _ = list(map(tuple, array)) + if array.ndim == 2: + _ = list(map(tuple, array)) + else: + _ = list(map(lambda e: (e,), array)) class PyArray2DTupleIterNext(ArrayProcessor): NAME = 'tuple(next(iter(a2d)))' SORT = 3 def __call__(self): - it = iter(self.array) - while True: - try: - _ = tuple(next(it)) - except StopIteration: - break + array = self.array + it = iter(array) + if array.ndim == 2: + while True: + try: + _ = tuple(next(it)) + except StopIteration: + break + else: + while True: + try: + _ = (next(it),) + except StopIteration: + break + #------------------------------------------------------------------------------- -NUMBER = 200 +NUMBER = 100 def seconds_to_display(seconds: float) -> str: seconds /= NUMBER @@ -156,7 +168,9 @@ class FixtureFactory: @staticmethod def get_array(size: int, width_ratio: int) -> np.ndarray: - return np.arange(size).reshape(size // width_ratio, width_ratio) + if width_ratio > 1: + return np.arange(size).reshape(size // width_ratio, width_ratio) + return np.arange(size) # return 1D array @classmethod def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: @@ -164,6 +178,7 @@ def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: return cls.NAME, array DENSITY_TO_DISPLAY = { + 'column-1': '1 Column', 'column-2': '2 Column', 'column-5': '5 Column', 'column-10': '10 Column', @@ -171,6 +186,15 @@ def get_label_array(cls, size: int) -> tp.Tuple[str, np.ndarray]: } +class FFC1(FixtureFactory): + NAME = 'column-1' + + @staticmethod + def get_array(size: int) -> np.ndarray: + a = FixtureFactory.get_array(size, 1) + return a + + class FFC2(FixtureFactory): NAME = 'column-2' @@ -217,6 +241,7 @@ def get_versions() -> str: CLS_FF = ( + FFC1, FFC2, FFC5, FFC10, @@ -226,7 +251,7 @@ def get_versions() -> str: def run_test(): records = [] - for size in (1_000, 10_000, 100_000, 1_000_000): + for size in (1_000, 10_000, 100_000): #, 1_000_000): for ff in CLS_FF: fixture_label, fixture = ff.get_label_array(size) for cls in CLS_PROCESSOR: From 9e703b509ac176577fe8a5b8cd8708298560dca4 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 12:10:56 -0700 Subject: [PATCH 22/28] updated peformance panels --- doc/articles/array_to_tuple_array.py | 2 +- doc/articles/array_to_tuple_iter.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/articles/array_to_tuple_array.py b/doc/articles/array_to_tuple_array.py index fa2efc2d..c9a21074 100644 --- a/doc/articles/array_to_tuple_array.py +++ b/doc/articles/array_to_tuple_array.py @@ -42,7 +42,7 @@ def __call__(self): post.flags.writeable = False #------------------------------------------------------------------------------- -NUMBER = 1 +NUMBER = 200 def seconds_to_display(seconds: float) -> str: seconds /= NUMBER diff --git a/doc/articles/array_to_tuple_iter.py b/doc/articles/array_to_tuple_iter.py index 0bfb6a9a..a05d5513 100644 --- a/doc/articles/array_to_tuple_iter.py +++ b/doc/articles/array_to_tuple_iter.py @@ -76,7 +76,7 @@ def __call__(self): #------------------------------------------------------------------------------- -NUMBER = 100 +NUMBER = 200 def seconds_to_display(seconds: float) -> str: seconds /= NUMBER @@ -251,7 +251,7 @@ def get_versions() -> str: def run_test(): records = [] - for size in (1_000, 10_000, 100_000): #, 1_000_000): + for size in (1_000, 10_000, 100_000, 1_000_000): for ff in CLS_FF: fixture_label, fixture = ff.get_label_array(size) for cls in CLS_PROCESSOR: From 4bc3192d9fb6c939029cc2655f74b4aa1852c51f Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 12:19:06 -0700 Subject: [PATCH 23/28] renamed ATTType and components --- src/_arraykit.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 8201d48a..db4da764 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3603,23 +3603,23 @@ array_to_tuple_array(PyObject *Py_UNUSED(m), PyObject *a) } //------------------------------------------------------------------------------ -// Array2DTuple Iterator +// ArrayToTupleIterator -static PyTypeObject A2DTupleType; +static PyTypeObject ATTType; -typedef struct A2DTupleObject { +typedef struct ATTObject { PyObject_HEAD PyArrayObject* array; npy_intp num_rows; npy_intp num_cols; Py_ssize_t pos; // current index state, mutated in-place -} A2DTupleObject; +} ATTObject; static PyObject * -A2DTuple_new(PyArrayObject* array, +ATT_new(PyArrayObject* array, npy_intp num_rows, npy_intp num_cols) { - A2DTupleObject* a2dt = PyObject_New(A2DTupleObject, &A2DTupleType); + ATTObject* a2dt = PyObject_New(ATTObject, &ATTType); if (!a2dt) { return NULL; } @@ -3632,19 +3632,19 @@ A2DTuple_new(PyArrayObject* array, } static void -A2DTuple_dealloc(A2DTupleObject *self) { +ATT_dealloc(ATTObject *self) { Py_DECREF((PyObject*)self->array); PyObject_Del((PyObject*)self); } static PyObject* -A2DTuple_iter(A2DTupleObject *self) { +ATT_iter(ATTObject *self) { Py_INCREF(self); return (PyObject*)self; } static PyObject * -A2DTuple_iternext(A2DTupleObject *self) { +ATT_iternext(ATTObject *self) { Py_ssize_t i = self->pos; if (i < self->num_rows) { npy_intp num_cols = self->num_cols; @@ -3686,30 +3686,30 @@ A2DTuple_iternext(A2DTupleObject *self) { } // static PyObject * -// A2DTuple_reversed(A2DTupleObject *self) { -// return A2DTuple_new(self->bi, !self->reversed); +// ATT_reversed(ATTObject *self) { +// return ATT_new(self->bi, !self->reversed); // } static PyObject * -A2DTuple_length_hint(A2DTupleObject *self) { +ATT_length_hint(ATTObject *self) { Py_ssize_t len = Py_MAX(0, self->num_rows - self->pos); return PyLong_FromSsize_t(len); } -static PyMethodDef A2DTuple_methods[] = { - {"__length_hint__", (PyCFunction)A2DTuple_length_hint, METH_NOARGS, NULL}, - // {"__reversed__", (PyCFunction)A2DTuple_reversed, METH_NOARGS, NULL}, +static PyMethodDef ATT_methods[] = { + {"__length_hint__", (PyCFunction)ATT_length_hint, METH_NOARGS, NULL}, + // {"__reversed__", (PyCFunction)ATT_reversed, METH_NOARGS, NULL}, {NULL}, }; -static PyTypeObject A2DTupleType = { +static PyTypeObject ATTType = { PyVarObject_HEAD_INIT(NULL, 0) - .tp_basicsize = sizeof(A2DTupleObject), - .tp_dealloc = (destructor) A2DTuple_dealloc, - .tp_iter = (getiterfunc) A2DTuple_iter, - .tp_iternext = (iternextfunc) A2DTuple_iternext, - .tp_methods = A2DTuple_methods, - .tp_name = "arraykit.A2DTupleIterator", + .tp_basicsize = sizeof(ATTObject), + .tp_dealloc = (destructor) ATT_dealloc, + .tp_iter = (getiterfunc) ATT_iter, + .tp_iternext = (iternextfunc) ATT_iternext, + .tp_methods = ATT_methods, + .tp_name = "arraykit.ATTIterator", }; // Given a 2D array, return an iterator of row tuples. @@ -3729,10 +3729,9 @@ array_to_tuple_iter(PyObject *Py_UNUSED(m), PyObject *a) if (ndim == 2) { num_cols = PyArray_DIM(array, 1); } - return A2DTuple_new(array, num_rows, num_cols); + return ATT_new(array, num_rows, num_cols); } - //------------------------------------------------------------------------------ // type resolution From bae0ba2f59f501052208066c6be392b8bafd9a5d Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 12:43:07 -0700 Subject: [PATCH 24/28] 0.7.1 RC 1 --- README.rst | 7 +++++++ setup.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 68f10907..f3828658 100644 --- a/README.rst +++ b/README.rst @@ -37,6 +37,13 @@ ArrayKit requires the following: What is New in ArrayKit ------------------------- + +0.7.1 +............ + +Extended ``array_to_tuple_array()`` and ``array_to_tuple_iter()`` to support 1D arrays. + + 0.7.0 ............ diff --git a/setup.py b/setup.py index e8b881fa..0def4834 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup from pathlib import Path -AK_VERSION = '0.7.0' +AK_VERSION = '0.7.1' def get_long_description() -> str: return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions. From 85a9e0829c8792f617d9be44d1764a5bf985c973 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sat, 22 Jun 2024 18:18:46 -0700 Subject: [PATCH 25/28] special handling for object ararys --- src/_arraykit.c | 18 +++++++++++++++++- test/test_util.py | 5 +++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index db4da764..bf4e5fcd 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3573,7 +3573,7 @@ array_to_tuple_array(PyObject *Py_UNUSED(m), PyObject *a) i++; } } - else { // ndim == 1 + else if (PyArray_TYPE(input_array) != NPY_OBJECT) { // ndim == 1, not object while (p < p_end) { tuple = PyTuple_New(1); if (tuple == NULL) { @@ -3590,6 +3590,22 @@ array_to_tuple_array(PyObject *Py_UNUSED(m), PyObject *a) i++; } } + else { // ndim == 1, object + while (p < p_end) { + tuple = PyTuple_New(1); + if (tuple == NULL) { + goto error; + } + // scalar returned in is native PyObject from object arrays + item = *(PyObject**)PyArray_GETPTR1(input_array, i); + Py_INCREF(item); + // TODO: identify tuple + PyTuple_SET_ITEM(tuple, 0, item); // steals reference to item + *p++ = tuple; // assign with new ref, no incr needed + i++; + } + } + PyArray_CLEARFLAGS((PyArrayObject *)output, NPY_ARRAY_WRITEABLE); return output; error: diff --git a/test/test_util.py b/test/test_util.py index 0b651430..58092ea5 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -303,8 +303,9 @@ def test_array2d_to_array1d_1d_c(self) -> None: def test_array2d_to_array1d_1d_d(self) -> None: a1 = np.array([('a', 10), ('b', 30), ('c', 5)], dtype=object) - a2 = array_to_tuple_array(a1) - self.assertEqual(a2.tolist(), [('a', 10), ('b', 30), ('c', 5)]) + a2 = array_to_tuple_array(a1) # from 2d + a3 = array_to_tuple_array(a2) # from 1d + self.assertEqual(a3.tolist(), [('a', 10), ('b', 30), ('c', 5)]) def test_array2d_to_array1d_1d_e(self) -> None: a1 = np.array([True, False, True], dtype=object) From d44be7d8201cff8b514c2fa4365aeb65a9ddc521 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sun, 23 Jun 2024 07:40:21 -0700 Subject: [PATCH 26/28] implement handling for tuples in object dtypes --- src/_arraykit.c | 41 +++++++++++++++++++++++++++++------------ test/test_util.py | 31 ++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index bf4e5fcd..fd55c7ec 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3592,20 +3592,22 @@ array_to_tuple_array(PyObject *Py_UNUSED(m), PyObject *a) } else { // ndim == 1, object while (p < p_end) { - tuple = PyTuple_New(1); - if (tuple == NULL) { - goto error; - } - // scalar returned in is native PyObject from object arrays item = *(PyObject**)PyArray_GETPTR1(input_array, i); - Py_INCREF(item); - // TODO: identify tuple - PyTuple_SET_ITEM(tuple, 0, item); // steals reference to item + Py_INCREF(item); // always incref + if (PyTuple_Check(item)) { + tuple = item; // do not double pack + } + else { + tuple = PyTuple_New(1); + if (tuple == NULL) { + goto error; + } + PyTuple_SET_ITEM(tuple, 0, item); // steals reference to item + } *p++ = tuple; // assign with new ref, no incr needed i++; } } - PyArray_CLEARFLAGS((PyArrayObject *)output, NPY_ARRAY_WRITEABLE); return output; error: @@ -3680,10 +3682,10 @@ ATT_iternext(ATTObject *self) { Py_DECREF(tuple); return NULL; } - PyTuple_SET_ITEM(tuple, j, item); // steals reference to item + PyTuple_SET_ITEM(tuple, j, item); // steals ref } } - else { // ndim == 1 + else if (PyArray_TYPE(array) != NPY_OBJECT) { // ndim == 1, not object tuple = PyTuple_New(1); if (tuple == NULL) { return NULL; @@ -3693,7 +3695,22 @@ ATT_iternext(ATTObject *self) { Py_DECREF(tuple); return NULL; } - PyTuple_SET_ITEM(tuple, 0, item); // steals reference to item + PyTuple_SET_ITEM(tuple, 0, item); // steals ref + } + else { // ndim == 1, object + item = *(PyObject**)PyArray_GETPTR1(array, i); + Py_INCREF(item); // always incref + if (PyTuple_Check(item)) { + tuple = item; // do not double pack + } + else { + tuple = PyTuple_New(1); + if (tuple == NULL) { + Py_DECREF(item); + return NULL; + } + PyTuple_SET_ITEM(tuple, 0, item); // steals ref + } } self->pos++; return tuple; diff --git a/test/test_util.py b/test/test_util.py index 58092ea5..93b819ec 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -286,34 +286,47 @@ def test_array_deepcopy_h(self) -> None: a2 = array_deepcopy(a1, ()) #--------------------------------------------------------------------------- - def test_array2d_to_array1d_1d_a(self) -> None: + def test_array_to_tuple_array_1d_a(self) -> None: a1 = np.arange(10) a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)]) - def test_array2d_to_array1d_1d_b(self) -> None: + def test_array_to_tuple_array_1d_b(self) -> None: a1 = np.array(['aaa', 'b', 'ccc']) a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [('aaa',), ('b',), ('ccc',)]) - def test_array2d_to_array1d_1d_c(self) -> None: + def test_array_to_tuple_array_1d_c(self) -> None: a1 = np.array([None, 'b', 30]) a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [(None,), ('b',), (30,)]) - def test_array2d_to_array1d_1d_d(self) -> None: + def test_array_to_tuple_array_1d_d(self) -> None: a1 = np.array([('a', 10), ('b', 30), ('c', 5)], dtype=object) a2 = array_to_tuple_array(a1) # from 2d + self.assertEqual(a2.tolist(), [('a', 10), ('b', 30), ('c', 5)]) a3 = array_to_tuple_array(a2) # from 1d self.assertEqual(a3.tolist(), [('a', 10), ('b', 30), ('c', 5)]) - def test_array2d_to_array1d_1d_e(self) -> None: + def test_array_to_tuple_array_1d_e(self) -> None: a1 = np.array([True, False, True], dtype=object) a2 = array_to_tuple_array(a1) self.assertIs(a2[0][0].__class__, bool) self.assertEqual(a2.tolist(), [(True,), (False,), (True,)]) - def test_array2d_to_array1d_b(self) -> None: + def test_array_to_tuple_array_1d_f(self) -> None: + a1 = np.array([None, None, None], dtype=object) + a1[0] = 3 + a1[1] = ('a', 30) + a1[2] = (None, True, 90000000) + + a2 = array_to_tuple_array(a1) + self.assertEqual(a2.tolist(), [(3,), ('a', 30), (None, True, 90000000)]) + + a3 = array_to_tuple_array(a2) + self.assertEqual(a3.tolist(), [(3,), ('a', 30), (None, True, 90000000)]) + + def test_array_to_tuple_array_b(self) -> None: a1 = np.arange(10, dtype=np.int64).reshape(5, 2) result = array_to_tuple_array(a1) assert isinstance(result[0], tuple) @@ -323,18 +336,18 @@ def test_array2d_to_array1d_b(self) -> None: self.assertEqual(tuple(result), ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9))) - def test_array2d_to_array1d_c(self) -> None: + def test_array_to_tuple_array_c(self) -> None: a1 = np.array([["a", "b"], ["ccc", "ddd"], ["ee", "ff"]]) a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [('a', 'b'), ('ccc', 'ddd'), ('ee', 'ff')]) - def test_array2d_to_array1d_d(self) -> None: + def test_array_to_tuple_array_d(self) -> None: a1 = np.array([[3, 5], [10, 20], [7, 2]], dtype=np.uint8) a2 = array_to_tuple_array(a1) self.assertEqual(a2.tolist(), [(3, 5), (10, 20), (7, 2)]) self.assertIs(type(a2[0][0]), np.uint8) - def test_array2d_to_array1d_e(self) -> None: + def test_array_to_tuple_array_e(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(4, 5) result = array_to_tuple_array(a1) self.assertEqual(result.tolist(), [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) From f76e22e73447844a5ec2db8cc252f4a22efaaa48 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sun, 23 Jun 2024 07:45:35 -0700 Subject: [PATCH 27/28] additional tests --- test/test_util.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/test/test_util.py b/test/test_util.py index 93b819ec..9e00f142 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -353,18 +353,18 @@ def test_array_to_tuple_array_e(self) -> None: self.assertEqual(result.tolist(), [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) #--------------------------------------------------------------------------- - def test_array2d_tuple_iter_a(self) -> None: + def test_array_to_tuple_iter_a(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(4, 5) result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 4) self.assertEqual(result, [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9), (10, 11, 12, 13, 14), (15, 16, 17, 18, 19)]) - def test_array2d_tuple_iter_b(self) -> None: + def test_array_to_tuple_iter_b(self) -> None: a1 = np.arange(20, dtype=np.int64).reshape(10, 2) result = list(array_to_tuple_iter(a1)) self.assertEqual(result, [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11), (12, 13), (14, 15), (16, 17), (18, 19)]) - def test_array2d_tuple_iter_c(self) -> None: + def test_array_to_tuple_iter_c(self) -> None: a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) it = array_to_tuple_iter(a1) self.assertEqual(it.__length_hint__(), 3) @@ -377,20 +377,20 @@ def test_array2d_tuple_iter_c(self) -> None: with self.assertRaises(StopIteration): next(it) - def test_array2d_tuple_iter_d(self) -> None: + def test_array_to_tuple_iter_d(self) -> None: a1 = np.array([['aaa', 'bb'], ['c', 'dd'], ['ee', 'fffff']]) it = array_to_tuple_iter(a1) # __reversed__ not implemented with self.assertRaises(TypeError): reversed(it) - def test_array2d_tuple_iter_e(self) -> None: + def test_array_to_tuple_iter_e(self) -> None: a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) it = array_to_tuple_iter(a1) del a1 self.assertEqual(list(it), [(None, 'bb'), (None, 'dd'), (3, None)]) - def test_array2d_tuple_iter_f(self) -> None: + def test_array_to_tuple_iter_f(self) -> None: a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) it1 = array_to_tuple_iter(a1) del a1 @@ -398,7 +398,7 @@ def test_array2d_tuple_iter_f(self) -> None: self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) self.assertEqual(list(it2), []) # expected behavior - def test_array2d_tuple_iter_g(self) -> None: + def test_array_to_tuple_iter_g(self) -> None: a1 = np.array([[None, 'bb'], [None, 'dd'], [3, None]]) it1 = array_to_tuple_iter(a1) it2 = array_to_tuple_iter(a1) @@ -406,23 +406,33 @@ def test_array2d_tuple_iter_g(self) -> None: self.assertEqual(list(it1), [(None, 'bb'), (None, 'dd'), (3, None)]) self.assertEqual(list(it2), [(None, 'bb'), (None, 'dd'), (3, None)]) - def test_array2d_tuple_iter_1d_a(self) -> None: + def test_array_to_tuple_iter_1d_a(self) -> None: a1 = np.array(['bb', 'c', 'aaa']) result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 3) self.assertEqual(result, [('bb',), ('c',), ('aaa',)]) - def test_array2d_tuple_iter_1d_b(self) -> None: + def test_array_to_tuple_iter_1d_b(self) -> None: a1 = np.array([20, -1, 8]) result = list(array_to_tuple_iter(a1)) self.assertEqual(len(result), 3) self.assertEqual(result, [(20,), (-1,), (8,)]) - def test_array2d_tuple_iter_1d_c(self) -> None: + def test_array_to_tuple_iter_1d_c(self) -> None: a1 = np.array([('a', 4), ('c', -1), ('d', 8)], dtype=object) - result = list(array_to_tuple_iter(a1)) - self.assertEqual(len(result), 3) - self.assertEqual(result, [('a', 4), ('c', -1), ('d', 8)]) + a2 = list(array_to_tuple_iter(a1)) + self.assertEqual(len(a2), 3) + self.assertEqual(a2, [('a', 4), ('c', -1), ('d', 8)]) + + def test_array_to_tuple_iter_1d_d(self) -> None: + a1 = np.array([None, None, None], dtype=object) + a1[0] = 3 + a1[1] = ('a', 30) + a1[2] = (None, True, 90000000) + + a2 = list(array_to_tuple_iter(a1)) + self.assertEqual(a2, [(3,), ('a', 30), (None, True, 90000000)]) + #--------------------------------------------------------------------------- From 55e19ee7f209db82da658257db011498024813ee Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sun, 23 Jun 2024 09:55:46 -0700 Subject: [PATCH 28/28] 0.7.2 RC 1 --- README.rst | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index f3828658..b1272a2b 100644 --- a/README.rst +++ b/README.rst @@ -38,6 +38,12 @@ What is New in ArrayKit ------------------------- +0.7.2 +............ + +Improved ``array_to_tuple_array()`` and ``array_to_tuple_iter()`` to preserve ``tuple`` in 1D arrays. + + 0.7.1 ............ diff --git a/setup.py b/setup.py index 0def4834..2b9a40f6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup from pathlib import Path -AK_VERSION = '0.7.1' +AK_VERSION = '0.7.2' def get_long_description() -> str: return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.