From fe5082f0c71d57d55770e0175fa40ecb6f1028c6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 1 May 2024 09:16:39 -0300 Subject: [PATCH] feat(python): Allow creation of dictionary and list types (#445) This PR adds support for creating dictionary and list types: ```python import nanoarrow as na na.list_(na.int32()) #> Schema(LIST, value_type=Schema(INT32, name='item')) na.dictionary(na.int32(), na.string()) #> Schema(DICTIONARY, index_type=Schema(INT32), value_type=Schema(STRING), dictionary_ordered=False) ``` Before, creating these types (or associated arrays from buffer) was not possible. This required some changes to `modify()` to ensure we could also set `children` and `dictionary` there. --- python/src/nanoarrow/__init__.py | 12 +- python/src/nanoarrow/_lib.pyx | 80 ++++++-- python/src/nanoarrow/schema.py | 289 ++++++++++++++++++++++------- python/tests/test_c_array.py | 2 +- python/tests/test_c_buffer.py | 6 +- python/tests/test_c_buffer_view.py | 4 +- python/tests/test_c_schema.py | 42 +++++ python/tests/test_iterator.py | 28 +-- python/tests/test_schema.py | 48 ++++- 9 files changed, 405 insertions(+), 106 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index c1cd12dd6..d96ed5c24 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -43,7 +43,7 @@ Type, TimeUnit, null, - bool, + bool_, int8, uint8, int16, @@ -57,6 +57,10 @@ float64, string, large_string, + list_, + large_list, + fixed_size_list, + dictionary, binary, large_binary, fixed_size_binary, @@ -88,7 +92,7 @@ "allocate_c_array_stream", "allocate_c_schema", "binary", - "bool", + "bool_", "c_array", "c_array_from_buffers", "c_array_stream", @@ -102,9 +106,11 @@ "date64", "decimal128", "decimal256", + "dictionary", "duration", "extension_type", "fixed_size_binary", + "fixed_size_list", "float16", "float32", "float64", @@ -117,6 +123,8 @@ "interval_months", "large_binary", "large_string", + "large_list", + 
"list_", "null", "string", "struct", diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 99127087a..04602120d 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -769,23 +769,55 @@ cdef class CSchema: else: return None - def modify(self, *, name=None, flags=None, nullable=None, metadata=None, - validate=True): - builder = CSchemaBuilder.copy(self) + def modify(self, *, format=None, name=None, flags=None, nullable=None, + metadata=None, children=None, dictionary=None, validate=True): + cdef CSchemaBuilder builder = CSchemaBuilder.allocate() - if name is not None: + if format is None: + builder.set_format(self.format) + else: + builder.set_format(format) + + if name is None: + builder.set_name(self.name) + elif name is not False: builder.set_name(name) - if flags is not None: + if flags is None: + builder.set_flags(self.flags) + else: builder.set_flags(flags) if nullable is not None: builder.set_nullable(nullable) - if metadata is not None: - builder.clear_metadata() + if metadata is None: + if self.metadata is not None: + builder.append_metadata(self.metadata) + else: builder.append_metadata(metadata) + if children is None: + if self.n_children > 0: + builder.allocate_children(self.n_children) + for i, child in enumerate(self.children): + builder.set_child(i, None, child) + elif hasattr(children, "items"): + builder.allocate_children(len(children)) + for i, item in enumerate(children.items()): + name, child = item + builder.set_child(i, name, child) + else: + builder.allocate_children(len(children)) + for i, child in enumerate(children): + builder.set_child(i, None, child) + + if dictionary is None: + if self.dictionary: + builder.set_dictionary(self.dictionary) + elif dictionary is not False: + builder.set_dictionary(dictionary) + if validate: builder.validate() @@ -1036,19 +1068,10 @@ cdef class CSchemaBuilder: if self._ptr.release == NULL: ArrowSchemaInit(self._ptr) - @staticmethod - def copy(CSchema 
schema): - return CSchemaBuilder(schema.__deepcopy__()) - @staticmethod def allocate(): return CSchemaBuilder(CSchema.allocate()) - def clear_metadata(self): - cdef int code = ArrowSchemaSetMetadata(self.c_schema._ptr, NULL) - Error.raise_error_not_ok("ArrowSchemaSetMetadata()", code) - return self - def append_metadata(self, metadata): cdef CBuffer buffer = CBuffer.empty() @@ -1164,6 +1187,23 @@ cdef class CSchemaBuilder: if name is not None: name = str(name) code = ArrowSchemaSetName(self._ptr.children[i], name.encode("UTF-8")) + Error.raise_error_not_ok("ArrowSchemaSetName()", code) + + return self + + def set_dictionary(self, CSchema dictionary): + self.c_schema._assert_valid() + + cdef int code + if self._ptr.dictionary == NULL: + code = ArrowSchemaAllocateDictionary(self._ptr) + Error.raise_error_not_ok("ArrowSchemaAllocateDictionary()", code) + + if self._ptr.dictionary.release != NULL: + ArrowSchemaRelease(self._ptr.dictionary) + + code = ArrowSchemaDeepCopy(dictionary._ptr, self._ptr.dictionary) + Error.raise_error_not_ok("ArrowSchemaDeepCopy()", code) return self @@ -1179,6 +1219,14 @@ cdef class CSchemaBuilder: return self + def set_dictionary_ordered(self, dictionary_ordered): + if dictionary_ordered: + self._ptr.flags = self._ptr.flags | ARROW_FLAG_DICTIONARY_ORDERED + else: + self._ptr.flags = self._ptr.flags & ~ARROW_FLAG_DICTIONARY_ORDERED + + return self + def validate(self): return CSchemaView(self.c_schema) diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py index 97bb45b8f..94d1a8c8f 100644 --- a/python/src/nanoarrow/schema.py +++ b/python/src/nanoarrow/schema.py @@ -193,23 +193,27 @@ def __init__( name=None, nullable=None, metadata=None, + fields=None, **params, ) -> None: if isinstance(obj, Type): - self._c_schema = _c_schema_from_type_and_params( - obj, params, name, nullable, metadata - ) - self._c_schema_view = CSchemaView(self._c_schema) - return - - if params: - raise ValueError("params are only supported for obj 
of class Type") - - self._c_schema = c_schema(obj) - - if name is not None or nullable is not None or metadata is not None: + self._c_schema = _c_schema_from_type_and_params(obj, params) + else: + if params: + raise ValueError("params are only supported for obj of class Type") + self._c_schema = c_schema(obj) + + if ( + name is not None + or nullable is not None + or metadata is not None + or fields is not None + ): self._c_schema = self._c_schema.modify( - name=name, nullable=nullable, metadata=metadata + name=name, + nullable=nullable, + metadata=metadata, + children=_clean_fields(fields), ) self._c_schema_view = CSchemaView(self._c_schema) @@ -343,6 +347,72 @@ def scale(self) -> int: return self._c_schema_view.decimal_scale + @property + def index_type(self) -> Union["Schema", None]: + """Dictionary index type + + For dictionary types, the type corresponding to the indices. + See also :attr:`value_type`. + + >>> import nanoarrow as na + >>> na.dictionary(na.int32(), na.string()).index_type + Schema(INT32) + """ + if self._c_schema_view.type_id == CArrowType.DICTIONARY: + index_schema = self._c_schema.modify( + dictionary=False, flags=0, nullable=self.nullable + ) + return Schema(index_schema) + else: + return None + + @property + def dictionary_ordered(self) -> Union[bool, None]: + """Dictionary ordering + + For dictionary types, returns ``True`` if the order of dictionary values + are meaningful. 
+ + >>> import nanoarrow as na + >>> na.dictionary(na.int32(), na.string()).dictionary_ordered + False + """ + return self._c_schema_view.dictionary_ordered + + @property + def value_type(self): + """Dictionary or list value type + + >>> import nanoarrow as na + >>> na.list_(na.int32()).value_type + Schema(INT32, name='item') + >>> na.dictionary(na.int32(), na.string()).value_type + Schema(STRING) + """ + if self._c_schema_view.type_id in ( + CArrowType.LIST, + CArrowType.LARGE_LIST, + CArrowType.FIXED_SIZE_LIST, + ): + return self.field(0) + elif self._c_schema_view.type_id == CArrowType.DICTIONARY: + return Schema(self._c_schema.dictionary) + else: + return None + + @property + def list_size(self) -> Union[int, None]: + """Fixed-size list element size + + >>> import nanoarrow as na + >>> na.fixed_size_list(na.int32(), 123).list_size + 123 + """ + if self._c_schema_view.type_id == CArrowType.FIXED_SIZE_LIST: + return self._c_schema_view.fixed_size + else: + return None + @property def n_fields(self) -> int: """Number of child Schemas @@ -408,7 +478,7 @@ def null(nullable: bool = True) -> Schema: return Schema(Type.NULL, nullable=nullable) -def bool(nullable: bool = True) -> Schema: +def bool_(nullable: bool = True) -> Schema: """Create an instance of a boolean type. Parameters @@ -420,7 +490,7 @@ def bool(nullable: bool = True) -> Schema: -------- >>> import nanoarrow as na - >>> na.bool() + >>> na.bool_() Schema(BOOL) """ return Schema(Type.BOOL, nullable=nullable) @@ -945,9 +1015,8 @@ def struct(fields, nullable=True) -> Schema: ---------- fields : * A dictionary whose keys are field names and values are schema-like objects - * An iterable whose items are a schema like object or a two-tuple of the - field name and a schema-like object. If a field name is not specified - from the tuple, the field name is inherited from the schema-like object. + * An iterable whose items are a schema like objects where the field name is + inherited from the schema-like object. 
nullable : bool, optional Use ``False`` to mark this field as non-nullable. @@ -957,14 +1026,113 @@ def struct(fields, nullable=True) -> Schema: >>> import nanoarrow as na >>> na.struct([na.int32()]) Schema(STRUCT, fields=[Schema(INT32)]) - >>> na.struct([("col1", na.int32())]) - Schema(STRUCT, fields=[Schema(INT32, name='col1')]) >>> na.struct({"col1": na.int32()}) Schema(STRUCT, fields=[Schema(INT32, name='col1')]) """ return Schema(Type.STRUCT, fields=fields, nullable=nullable) +def list_(value_type, nullable=True) -> Schema: + """Create a type representing a variable-size list of some other type. + + Parameters + ---------- + value_type : schema-like + The type of values in each list element. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.list_(na.int32()) + Schema(LIST, value_type=Schema(INT32, name='item')) + """ + return Schema(Type.LIST, value_type=value_type, nullable=nullable) + + +def large_list(value_type, nullable=True) -> Schema: + """Create a type representing a variable-size list of some other type. + + Unlike :func:`list_`, :func:`large_list` can accommodate arrays + with more than ``2 ** 31 - 1`` items in the values array. + + Parameters + ---------- + value_type : schema-like + The type of values in each list element. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.large_list(na.int32()) + Schema(LARGE_LIST, value_type=Schema(INT32, name='item')) + """ + return Schema(Type.LARGE_LIST, value_type=value_type, nullable=nullable) + + +def fixed_size_list(value_type, list_size, nullable=True) -> Schema: + """Create a type representing a fixed-size list of some other type. + + Parameters + ---------- + value_type : schema-like + The type of values in each list element. + list_size : int + The number of values in each list element. 
+ nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.fixed_size_list(na.int32(), 123) + Schema(FIXED_SIZE_LIST, value_type=Schema(INT32, name='item'), list_size=123) + """ + return Schema( + Type.FIXED_SIZE_LIST, + value_type=value_type, + list_size=list_size, + nullable=nullable, + ) + + +def dictionary(index_type, value_type, dictionary_ordered=False): + """Create a type representing dictionary-encoded values + + Parameters + ---------- + index_type : schema-like + The data type of the indices. Must be an integral type. + value_type : schema-like + The type of the dictionary array. + dictionary_ordered : bool, optional + Use ``True`` if the order of values in the dictionary array is + meaningful. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.dictionary(na.int32(), na.string()) + Schema(DICTIONARY, index_type=Schema(INT32), value_type=Schema(STRING), \ +dictionary_ordered=False) + """ + return Schema( + Type.DICTIONARY, + index_type=index_type, + value_type=value_type, + dictionary_ordered=dictionary_ordered, + ) + + def extension_type( storage_schema, extension_name: str, @@ -993,24 +1161,10 @@ def extension_type( return Schema(storage_schema, nullable=nullable, metadata=metadata) -def _c_schema_from_type_and_params( - type: Type, - params: dict, - name: Union[bool, None, bool], - nullable: Union[bool, None], - metadata: Mapping[Union[str, bytes], Union[str, bytes]], -): +def _c_schema_from_type_and_params(type: Type, params: dict): factory = CSchemaBuilder.allocate() - if type == Type.STRUCT: - fields = _clean_fields(params.pop("fields")) - - factory.set_format("+s").allocate_children(len(fields)) - for i, item in enumerate(fields): - child_name, c_schema = item - factory.set_child(i, child_name, c_schema) - - elif type.value in 
CSchemaView._decimal_types: precision = int(params.pop("precision")) scale = int(params.pop("scale")) factory.set_type_decimal(type.value, precision, scale) @@ -1029,6 +1183,32 @@ def _c_schema_from_type_and_params( elif type == Type.FIXED_SIZE_BINARY: factory.set_type_fixed_size(type.value, int(params.pop("byte_width"))) + elif type == Type.LIST: + factory.set_format("+l") + factory.allocate_children(1) + factory.set_child(0, "item", c_schema(params.pop("value_type"))) + + elif type == Type.LARGE_LIST: + factory.set_format("+L") + factory.allocate_children(1) + factory.set_child(0, "item", c_schema(params.pop("value_type"))) + + elif type == Type.FIXED_SIZE_LIST: + fixed_size = int(params.pop("list_size")) + factory.set_format(f"+w:{fixed_size}") + factory.allocate_children(1) + factory.set_child(0, "item", c_schema(params.pop("value_type"))) + + elif type == Type.DICTIONARY: + index_type = c_schema(params.pop("index_type")) + factory.set_format(index_type.format) + + value_type = c_schema(params.pop("value_type")) + factory.set_dictionary(value_type) + + if "dictionary_ordered" in params and bool(params.pop("dictionary_ordered")): + factory.set_dictionary_ordered(True) + else: factory.set_type(type.value) @@ -1036,38 +1216,19 @@ def _c_schema_from_type_and_params( unused = ", ".join(f"'{item}'" for item in params.keys()) raise ValueError(f"Unused parameters whilst constructing Schema: {unused}") - # Apply default nullability (True) - if nullable is None: - nullable = True - factory.set_nullable(nullable) - - # Apply default name (an empty string). To explicitly set a NULL - # name, a caller would have to specify False. 
- if name is None: - name = "" - elif name is False: - name = None - factory.set_name(name) - - # Apply metadata - if metadata is not None: - factory.append_metadata(metadata) + # Better default than NULL, which causes some implementations to crash + factory.set_name("") return factory.finish() def _clean_fields(fields): - if isinstance(fields, dict): - return [(str(k), c_schema(v)) for k, v in fields.items()] + if fields is None: + return None + elif hasattr(fields, "items"): + return {k: c_schema(v) for k, v in fields.items()} else: - fields_clean = [] - for item in fields: - if isinstance(item, tuple) and len(item) == 2: - fields_clean.append((str(item[0]), c_schema(item[1]))) - else: - fields_clean.append((None, c_schema(item))) - - return fields_clean + return [c_schema(v) for v in fields] def _schema_repr(obj): @@ -1120,4 +1281,8 @@ def _schema_param_repr(name, value): CArrowType.DECIMAL128: ("precision", "scale"), CArrowType.DECIMAL256: ("precision", "scale"), CArrowType.STRUCT: ("fields",), + CArrowType.LIST: ("value_type",), + CArrowType.LARGE_LIST: ("value_type",), + CArrowType.FIXED_SIZE_LIST: ("value_type", "list_size"), + CArrowType.DICTIONARY: ("index_type", "value_type", "dictionary_ordered"), } diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py index b64370da8..b5ec3b900 100644 --- a/python/tests/test_c_array.py +++ b/python/tests/test_c_array.py @@ -303,7 +303,7 @@ def test_c_array_from_iterable_float_with_nulls(): def test_c_array_from_iterable_bool_with_nulls(): - c_array = na.c_array([True, None, False], na.bool()) + c_array = na.c_array([True, None, False], na.bool_()) assert c_array.length == 3 assert c_array.null_count == 1 diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py index 40ba4a3b3..d18435cc8 100644 --- a/python/tests/test_c_buffer.py +++ b/python/tests/test_c_buffer.py @@ -322,7 +322,7 @@ def test_c_buffer_from_decimal256_iterable(): def test_c_buffer_bitmap_from_iterable(): # Check 
something less than one byte - buffer = na.c_buffer([True, False, False, True], na.bool()) + buffer = na.c_buffer([True, False, False, True], na.bool_()) assert "10010000" in repr(buffer) assert buffer.size_bytes == 1 assert buffer.data_type == "bool" @@ -343,13 +343,13 @@ def test_c_buffer_bitmap_from_iterable(): ) # Check something exactly one byte - buffer = na.c_buffer([True, False, False, True] * 2, na.bool()) + buffer = na.c_buffer([True, False, False, True] * 2, na.bool_()) assert "10011001" in repr(buffer) assert buffer.size_bytes == 1 assert list(buffer.elements()) == [True, False, False, True] * 2 # Check something more than one byte - buffer = na.c_buffer([True, False, False, True] * 3, na.bool()) + buffer = na.c_buffer([True, False, False, True] * 3, na.bool_()) assert "1001100110010000" in repr(buffer) assert buffer.size_bytes == 2 assert list(buffer.elements()) == [True, False, False, True] * 3 + [ diff --git a/python/tests/test_c_buffer_view.py b/python/tests/test_c_buffer_view.py index b885488ea..e84c04dfe 100644 --- a/python/tests/test_c_buffer_view.py +++ b/python/tests/test_c_buffer_view.py @@ -20,8 +20,8 @@ import nanoarrow as na -def test_buffer_view_bool(): - bool_array_view = na.c_array_view([1, 0, 0, 1], na.bool()) +def test_buffer_view_bool_(): + bool_array_view = na.c_array_view([1, 0, 0, 1], na.bool_()) view = bool_array_view.buffer(1) assert view.element_size_bits == 1 diff --git a/python/tests/test_c_schema.py b/python/tests/test_c_schema.py index 5617fe7bd..e299157fd 100644 --- a/python/tests/test_c_schema.py +++ b/python/tests/test_c_schema.py @@ -133,6 +133,9 @@ def test_c_schema_modify(): assert schema_clone is not schema assert schema._addr() != schema_clone._addr() + schema_formatted = schema.modify(format="i") + assert schema_formatted.format == "i" + schema_named = schema.modify(name="something else") assert schema_named.name == "something else" assert schema_named.format == schema.format @@ -155,3 +158,42 @@ def 
test_c_schema_modify(): schema_no_metad = schema_metad.modify(metadata={}) assert schema_no_metad.metadata is None + + +def test_c_schema_modify_children(): + schema = na.c_schema(na.struct({"col1": na.null()})) + + schema_same_children = schema.modify() + assert schema_same_children.n_children == 1 + assert schema_same_children.child(0).name == "col1" + assert schema_same_children.child(0).format == "n" + + schema_new_children_list = schema.modify( + children=[na.c_schema(na.int32()).modify(name="new name")] + ) + assert schema_new_children_list.n_children == 1 + assert schema_new_children_list.child(0).name == "new name" + assert schema_new_children_list.child(0).format == "i" + + schema_new_children_dict = schema.modify( + children={"new name": na.c_schema(na.int32())} + ) + assert schema_new_children_dict.n_children == 1 + assert schema_new_children_dict.child(0).name == "new name" + assert schema_new_children_dict.child(0).format == "i" + + +def test_c_schema_modify_dictionary(): + schema = na.c_schema(na.int32()) + + schema_dictionary = schema.modify(dictionary=na.c_schema(na.string())) + assert schema_dictionary.format == "i" + assert schema_dictionary.dictionary.format == "u" + + schema_same_dictionary = schema_dictionary.modify() + assert schema_same_dictionary.format == "i" + assert schema_same_dictionary.dictionary.format == "u" + + schema_no_dictionary = schema_dictionary.modify(dictionary=False) + assert schema_no_dictionary.format == "i" + assert schema.dictionary is None diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py index 826f9be2d..ff0b34e2c 100644 --- a/python/tests/test_iterator.py +++ b/python/tests/test_iterator.py @@ -106,10 +106,13 @@ def test_iterator_nullable_binary(): def test_iter_tuples(): array = na.c_array_from_buffers( - na.struct({"col1": na.int32(), "col2": na.bool()}), + na.struct({"col1": na.int32(), "col2": na.bool_()}), length=3, buffers=[None], - children=[na.c_array([1, 2, 3], na.int32()), 
na.c_array([1, 0, 1], na.bool())], + children=[ + na.c_array([1, 2, 3], na.int32()), + na.c_array([1, 0, 1], na.bool_()), + ], ) assert list(iter_tuples(array)) == [(1, True), (2, False), (3, True)] @@ -131,12 +134,12 @@ def test_iter_tuples(): def test_iter_tuples_nullable(): array = na.c_array_from_buffers( - na.struct({"col1": na.int32(), "col2": na.bool()}), + na.struct({"col1": na.int32(), "col2": na.bool_()}), length=4, - buffers=[na.c_buffer([True, True, True, False], na.bool())], + buffers=[na.c_buffer([True, True, True, False], na.bool_())], children=[ na.c_array([1, 2, 3, 4], na.int32()), - na.c_array([1, 0, 1, 0], na.bool()), + na.c_array([1, 0, 1, 0], na.bool_()), ], ) @@ -148,7 +151,7 @@ def test_iter_tuples_nullable(): sliced_child = na.c_array_from_buffers( array.schema, length=3, - buffers=[na.c_buffer([True, True, False], na.bool())], + buffers=[na.c_buffer([True, True, False], na.bool_())], children=[array.child(0)[1:], array.child(1)[1:]], ) assert list(iter_tuples(sliced_child)) == [(2, False), (3, True), None] @@ -164,10 +167,13 @@ def test_iter_tuples_errors(): def test_iterator_struct(): array = na.c_array_from_buffers( - na.struct({"col1": na.int32(), "col2": na.bool()}), + na.struct({"col1": na.int32(), "col2": na.bool_()}), length=3, buffers=[None], - children=[na.c_array([1, 2, 3], na.int32()), na.c_array([1, 0, 1], na.bool())], + children=[ + na.c_array([1, 2, 3], na.int32()), + na.c_array([1, 0, 1], na.bool_()), + ], ) assert list(iter_py(array)) == [ @@ -185,12 +191,12 @@ def test_iterator_struct(): def test_iterator_nullable_struct(): array = na.c_array_from_buffers( - na.struct({"col1": na.int32(), "col2": na.bool()}), + na.struct({"col1": na.int32(), "col2": na.bool_()}), length=4, - buffers=[na.c_buffer([True, True, True, False], na.bool())], + buffers=[na.c_buffer([True, True, True, False], na.bool_())], children=[ na.c_array([1, 2, 3, 4], na.int32()), - na.c_array([1, 0, 1, 0], na.bool()), + na.c_array([1, 0, 1, 0], na.bool_()), 
], ) diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py index cc9e42b24..38c412f3f 100644 --- a/python/tests/test_schema.py +++ b/python/tests/test_schema.py @@ -77,13 +77,17 @@ def test_schema_create_no_params(): assert schema_obj.name == "not empty" assert "name='not empty'" in repr(schema_obj) + msg = "params are only supported for obj of class Type" + with pytest.raises(ValueError, match=msg): + na.Schema(na.fixed_size_binary(123), byte_width=12) + with pytest.raises(ValueError, match=r"^Unused parameter"): na.Schema(na.Type.INT32, unused_param="unused_value") def test_schema_simple(): assert na.null().type == na.Type.NULL - assert na.bool().type == na.Type.BOOL + assert na.bool_().type == na.Type.BOOL assert na.int8().type == na.Type.INT8 assert na.uint8().type == na.Type.UINT8 assert na.int16().type == na.Type.INT16 @@ -171,13 +175,6 @@ def test_schema_struct(): assert "fields=[Schema(INT32)]" in repr(schema_obj) - # Make sure we can use a list of two-tuples - schema_obj = na.struct([("col_name", na.Type.INT32)]) - assert schema_obj.type == na.Type.STRUCT - assert schema_obj.field(0).type == na.Type.INT32 - assert schema_obj.field(0).name == "col_name" - assert "fields=[Schema(INT32, name='col_name')]" in repr(schema_obj) - # Make sure we can use a dictionary to specify fields schema_obj = na.struct({"col_name": na.Type.INT32}) assert schema_obj.type == na.Type.STRUCT @@ -185,13 +182,46 @@ def test_schema_struct(): assert schema_obj.field(0).name == "col_name" # Make sure we can use a Schema when constructing fields (and that - # fild names are taken from the input) + # field names are taken from the input) schema_obj = na.struct([schema_obj.field(0)]) assert schema_obj.type == na.Type.STRUCT assert schema_obj.field(0).type == na.Type.INT32 assert schema_obj.field(0).name == "col_name" +def test_schema_list_(): + schema_obj = na.list_(na.null()) + assert schema_obj.type == na.Type.LIST + assert schema_obj.value_type.type == na.Type.NULL + 
+ +def test_schema_large_list(): + schema_obj = na.large_list(na.null()) + assert schema_obj.type == na.Type.LARGE_LIST + assert schema_obj.value_type.type == na.Type.NULL + + +def test_schema_fixed_size_list(): + schema_obj = na.fixed_size_list(na.null(), 123) + assert schema_obj.type == na.Type.FIXED_SIZE_LIST + assert schema_obj.value_type.type == na.Type.NULL + assert schema_obj.list_size == 123 + + +def test_schema_dictionary(): + schema_obj = na.dictionary(na.int8(), na.null()) + assert schema_obj.type == na.Type.DICTIONARY + assert schema_obj.index_type.type == na.Type.INT8 + assert schema_obj.value_type.type == na.Type.NULL + assert schema_obj.dictionary_ordered is False + + schema_obj_ordered = na.dictionary(na.int8(), na.null(), dictionary_ordered=True) + assert schema_obj_ordered.type == na.Type.DICTIONARY + assert schema_obj_ordered.index_type.type == na.Type.INT8 + assert schema_obj_ordered.value_type.type == na.Type.NULL + assert schema_obj_ordered.dictionary_ordered is True + + def test_schema_extension(): schema_obj = na.int32() assert schema_obj.extension is None