From 6fc5289e25e5933d81ce52dcb1b9568134e622d5 Mon Sep 17 00:00:00 2001 From: PhilipGutberlet <92990487+PhilipGutberlet@users.noreply.github.com> Date: Fri, 26 May 2023 15:57:48 +0200 Subject: [PATCH 01/51] _data_type function --- src/safeds/data/tabular/containers/_column.py | 2 +- .../data/tabular/typing/_column_type.py | 63 ++++++++++--- .../data/tabular/typing/test_column_type.py | 89 ++++++++++++------- 3 files changed, 106 insertions(+), 48 deletions(-) diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index ae42ace06..72f4c0741 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -105,7 +105,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None: self._name: str = name self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name) # noinspection PyProtectedMember - self._type: ColumnType = ColumnType._from_numpy_data_type(self._data.dtype) + self._type: ColumnType = ColumnType._data_type(self) def __contains__(self, item: Any) -> bool: return item in self._data diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 82396f334..891aa1ec1 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -2,23 +2,25 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any + if TYPE_CHECKING: import numpy as np + from safeds.data.tabular.containers import Column class ColumnType(ABC): """Abstract base class for column types.""" @staticmethod - def _from_numpy_data_type(data_type: np.dtype) -> ColumnType: + def _data_type(column: Column) -> ColumnType: """ Return the column type for a given `numpy` data type. Parameters ---------- - data_type : numpy.dtype + self : Column The `numpy` data type. Returns @@ -31,17 +33,50 @@ def _from_numpy_data_type(data_type: np.dtype) -> ColumnType: NotImplementedError If the given data type is not supported. """ - if data_type.kind in ("u", "i"): - return Integer() - if data_type.kind == "b": - return Boolean() - if data_type.kind == "f": - return RealNumber() - if data_type.kind in ("S", "U", "O", "M", "m"): - return String() - - message = f"Unsupported numpy data type '{data_type}'." - raise NotImplementedError(message) + + def columntype_of_type(celltype: Any) -> ColumnType: + if celltype == int: + return Integer() + if celltype == bool: + return Boolean() + if celltype == float: + return RealNumber() + if celltype == str: + return String() + if celltype is None: + return Anything(is_nullable=True) + else: + message = f"Unsupported numpy data type '{celltype}'." + raise NotImplementedError(message) + + for cell in column: + print("Hallo") + if column.type is None: + column.type = columntype_of_type(type(cell)) + elif column.type != columntype_of_type(type(cell)): + if column.type == Integer and type(cell) == float: + column.type = RealNumber() + else: + column.type = Anything() + return column.type + + + + # if celltype == int: + # return Integer() + # if celltype == bool: + # return Boolean() + # if celltype == float: + # return RealNumber() + # if celltype == str: + # return String() + # else: + # return Anything() + + + # message = f"Unsupported numpy data type '{data_type}'." + # raise NotImplementedError(message) + @abstractmethod def is_nullable(self) -> bool: diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 5ad58915b..d198e6573 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,5 +1,7 @@ import numpy as np import pytest + +from safeds.data.tabular.containers import Column from safeds.data.tabular.typing import ( Anything, Boolean, @@ -10,44 +12,65 @@ ) -class TestFromNumpyDataType: - # Test cases taken from https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars +# class TestFromNumpyDataType: +# # Test cases taken from https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars +# @pytest.mark.parametrize( +# ("data_type", "expected"), +# [ +# # Boolean +# (np.dtype(np.bool_), Boolean()), +# # Number +# (np.dtype(np.half), RealNumber()), +# (np.dtype(np.single), RealNumber()), +# (np.dtype(np.float_), RealNumber()), +# (np.dtype(np.longfloat), RealNumber()), +# # Int +# (np.dtype(np.byte), Integer()), +# (np.dtype(np.short), Integer()), +# (np.dtype(np.intc), Integer()), +# (np.dtype(np.int_), Integer()), +# (np.dtype(np.longlong), Integer()), +# (np.dtype(np.ubyte), Integer()), +# (np.dtype(np.ushort), Integer()), +# (np.dtype(np.uintc), Integer()), +# (np.dtype(np.uint), Integer()), +# (np.dtype(np.ulonglong), Integer()), +# # String +# (np.dtype(np.str_), String()), +# (np.dtype(np.unicode_), String()), +# (np.dtype(np.object_), String()), +# (np.dtype(np.datetime64), String()), +# (np.dtype(np.timedelta64), String()), +# ], +# ids=repr, +# ) +# def test_should_create_column_type_from_numpy_data_type(self, data_type: np.dtype, expected: ColumnType) -> None: +# assert ColumnType._from_numpy_data_type(data_type) == expected +# +# def test_should_raise_if_data_type_is_not_supported(self) -> None: +# with pytest.raises(NotImplementedError): +# ColumnType._from_numpy_data_type(np.dtype(np.void)) +# + +class TestDataType: @pytest.mark.parametrize( - ("data_type", "expected"), + ("column", "expected"), [ - # Boolean - (np.dtype(np.bool_), Boolean()), - # Number - (np.dtype(np.half), RealNumber()), - (np.dtype(np.single), RealNumber()), - (np.dtype(np.float_), RealNumber()), - (np.dtype(np.longfloat), RealNumber()), - # Int - (np.dtype(np.byte), Integer()), - (np.dtype(np.short), Integer()), - (np.dtype(np.intc), Integer()), - (np.dtype(np.int_), Integer()), - (np.dtype(np.longlong), Integer()), - (np.dtype(np.ubyte), Integer()), - (np.dtype(np.ushort), Integer()), - (np.dtype(np.uintc), Integer()), - (np.dtype(np.uint), Integer()), - (np.dtype(np.ulonglong), Integer()), - # String - (np.dtype(np.str_), String()), - (np.dtype(np.unicode_), String()), - (np.dtype(np.object_), String()), - (np.dtype(np.datetime64), String()), - (np.dtype(np.timedelta64), String()), + (Column("a", [1, 2, 3]), Integer()), + (Column("a", [1.0, 2.0, 3.0]), RealNumber()), + (Column("a", [True, False, True]), Boolean()), + (Column("a", ["a", "b", "c"]), String()), + (Column("a", [None, None, None]), Anything(is_nullable=True)), + (Column("a", [1, 2, None]), Anything(is_nullable=True)), + (Column("a", [1.0, 2.0, None]), Anything(is_nullable=True)), + (Column("a", [True, False, None]), Anything(is_nullable=True)), + (Column("a", ["a", "b", None]), Anything(is_nullable=True)), + ], ids=repr, ) - def test_should_create_column_type_from_numpy_data_type(self, data_type: np.dtype, expected: ColumnType) -> None: - assert ColumnType._from_numpy_data_type(data_type) == expected - - def test_should_raise_if_data_type_is_not_supported(self) -> None: - with pytest.raises(NotImplementedError): - ColumnType._from_numpy_data_type(np.dtype(np.void)) + def test_should_return_the_data_type(self, column: Column, expected: ColumnType) -> None: + assert ColumnType._data_type(column) == expected class TestRepr: From f94a226af1cbf2ffdfdbe34c53c14ce7445a5a1b Mon Sep 17 00:00:00 2001 From: PhilipGutberlet <92990487+PhilipGutberlet@users.noreply.github.com> Date: Fri, 2 Jun 2023 09:27:27 +0200 Subject: [PATCH 02/51] Columny_type is working without is_nullable --- src/safeds/data/tabular/containers/_column.py | 5 ++-- .../data/tabular/typing/_column_type.py | 29 ++++++++++--------- .../data/tabular/typing/test_column_type.py | 25 ++++++++-------- 3 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index 72f4c0741..f142bff12 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -75,7 +75,7 @@ def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Col result._name = data.name result._data = data # noinspection PyProtectedMember - result._type = type_ if type_ is not None else ColumnType._from_numpy_data_type(data.dtype) + result._type = type_ if type_ is not None else ColumnType._data_type(data) return result @@ -105,7 +105,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None: self._name: str = name self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name) # noinspection PyProtectedMember - self._type: ColumnType = ColumnType._data_type(self) + self._type: ColumnType = ColumnType._data_type(data) def __contains__(self, item: Any) -> bool: return item in self._data @@ -688,3 +688,4 @@ def _count_missing_values(self) -> int: The number of null values. """ return self._data.isna().sum() + diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 891aa1ec1..4dc60699a 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any +import pandas as pd if TYPE_CHECKING: import numpy as np @@ -14,14 +15,14 @@ class ColumnType(ABC): """Abstract base class for column types.""" @staticmethod - def _data_type(column: Column) -> ColumnType: + def _data_type(data: pd.Series) -> ColumnType: """ Return the column type for a given `numpy` data type. Parameters ---------- - self : Column - The `numpy` data type. + data : pd.Series + The data to be checked. Returns ------- @@ -44,21 +45,23 @@ def columntype_of_type(celltype: Any) -> ColumnType: if celltype == str: return String() if celltype is None: - return Anything(is_nullable=True) + return Anything(is_nullable=True) #when Nothing() exists Nothing() else: message = f"Unsupported numpy data type '{celltype}'." raise NotImplementedError(message) - for cell in column: - print("Hallo") - if column.type is None: - column.type = columntype_of_type(type(cell)) - elif column.type != columntype_of_type(type(cell)): - if column.type == Integer and type(cell) == float: - column.type = RealNumber() + for cell in data: + result = None #set type to Nothing as a default + is_nullable = False + if result is None: + result = columntype_of_type(type(cell)) + elif result != columntype_of_type(type(cell)): + is_nullable = True + if result == Integer and type(cell) == float: + result = RealNumber() else: - column.type = Anything() - return column.type + result = Anything() + return result diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index d198e6573..928d7284a 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from safeds.data.tabular.containers import Column @@ -54,23 +55,23 @@ class TestDataType: @pytest.mark.parametrize( - ("column", "expected"), + ("data", "expected"), [ - (Column("a", [1, 2, 3]), Integer()), - (Column("a", [1.0, 2.0, 3.0]), RealNumber()), - (Column("a", [True, False, True]), Boolean()), - (Column("a", ["a", "b", "c"]), String()), - (Column("a", [None, None, None]), Anything(is_nullable=True)), - (Column("a", [1, 2, None]), Anything(is_nullable=True)), - (Column("a", [1.0, 2.0, None]), Anything(is_nullable=True)), - (Column("a", [True, False, None]), Anything(is_nullable=True)), - (Column("a", ["a", "b", None]), Anything(is_nullable=True)), + (pd.Series([1, 2, 3]), Integer()), + (pd.Series([1.0, 2.0, 3.0]), RealNumber()), + (pd.Series([True, False, True]), Boolean()), + (pd.Series(["a", "b", "c"]), String()), + (pd.Series([None, None, None]), Anything(is_nullable=True)), + (pd.Series([1, 2, None]), Anything(is_nullable=True)), + (pd.Series([1.0, 2.0, None]), Anything(is_nullable=True)), + (pd.Series([True, False, None]), Anything(is_nullable=True)), + (pd.Series(["a", "b", None]), Anything(is_nullable=True)), ], ids=repr, ) - def test_should_return_the_data_type(self, column: Column, expected: ColumnType) -> None: - assert ColumnType._data_type(column) == expected + def test_should_return_the_data_type(self, data: pd.Series, expected: ColumnType) -> None: + assert ColumnType._data_type(data) == expected class TestRepr: From cd7d745e79cf96c240fd1d665a5a2b9d4d5d586a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Gr=C3=A9us?= Date: Fri, 2 Jun 2023 11:56:13 +0200 Subject: [PATCH 03/51] feat: Added new static method `Schema.merge_multiple_schemas` to merge multiple schemas into one feat: Added ability to `Table.add_row` and `Table.add_rows` to allow new rows with different schemas feat: Added the method `Row.sort_columns` to sort the columns in a row --- src/safeds/data/tabular/containers/_row.py | 34 ++++- src/safeds/data/tabular/containers/_table.py | 71 ++++++---- src/safeds/data/tabular/typing/_schema.py | 55 ++++++++ .../tabular/containers/_table/test_add_row.py | 40 ++++-- .../containers/_table/test_add_rows.py | 19 ++- .../safeds/data/tabular/typing/test_schema.py | 121 +++++++++++++++++- 6 files changed, 298 insertions(+), 42 deletions(-) diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py index 0e338caed..b0aa5a1b7 100644 --- a/src/safeds/data/tabular/containers/_row.py +++ b/src/safeds/data/tabular/containers/_row.py @@ -1,7 +1,8 @@ from __future__ import annotations +import functools from collections.abc import Mapping -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable, Tuple import pandas as pd @@ -440,6 +441,37 @@ def get_column_type(self, column_name: str) -> ColumnType: """ return self._schema.get_column_type(column_name) + # ------------------------------------------------------------------------------------------------------------------ + # Transformations + # ------------------------------------------------------------------------------------------------------------------ + + def sort_columns(self, comparator: Callable[[Tuple, Tuple], int] = lambda col1, col2: (col1[0] > col2[0]) + - (col1[0] < col2[0])) -> Row: + """ + Sort the columns of a `Row` with the given comparator and return a new `Row`. + + The original row is not modified. The comparator is a function that takes two Tuples of (ColumnName: Value) `col1` and `col2` and + returns an integer: + + * If `col1` should be ordered before `col2`, the function should return a negative number. + * If `col1` should be ordered after `col2`, the function should return a positive number. + * If the original order of `col1` and `col2` should be kept, the function should return 0. + + If no comparator is given, the columns will be sorted alphabetically by their name. + + Parameters + ---------- + comparator : Callable[[Tuple, Tuple], int] + The function used to compare two Tuples of (ColumnName: Value). + + Returns + ------- + new_row : Row + A new row with sorted columns. + """ + sorted_row_dict = dict(sorted(self.to_dict().items(), key=functools.cmp_to_key(comparator))) + return Row.from_dict(sorted_row_dict) + # ------------------------------------------------------------------------------------------------------------------ # Conversion # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 99709faea..06e9ac6f1 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -245,23 +245,28 @@ def from_rows(rows: list[Row]) -> Table: Raises ------ - SchemaMismatchError - If any of the row schemas does not match with the others. + UnknownColumnNameError + If any of the row column names does not match with the first row. """ if len(rows) == 0: return Table._from_pandas_dataframe(pd.DataFrame()) - schema_compare: Schema = rows[0]._schema + column_names_compare: list = list(rows[0].column_names) + unknown_column_names = set() row_array: list[pd.DataFrame] = [] for row in rows: - if schema_compare != row._schema: - raise SchemaMismatchError + unknown_column_names.update(set(column_names_compare) - set(row.column_names)) row_array.append(row._data) + if len(unknown_column_names) > 0: + raise UnknownColumnNameError(list(unknown_column_names)) dataframe: DataFrame = pd.concat(row_array, ignore_index=True) - dataframe.columns = schema_compare.column_names - return Table._from_pandas_dataframe(dataframe) + dataframe.columns = column_names_compare + + schema = Schema.merge_multiple_schemas(list(row.schema for row in rows)) + + return Table._from_pandas_dataframe(dataframe, schema) @staticmethod def _from_pandas_dataframe(data: pd.DataFrame, schema: Schema | None = None) -> Table: @@ -636,7 +641,8 @@ def add_row(self, row: Row) -> Table: """ Add a row to the table. - This table is not modified. + The order of columns of the new row will be adjusted to the order of columns in the table. + This table will contain the merged schema. Parameters ---------- @@ -650,21 +656,30 @@ def add_row(self, row: Row) -> Table: Raises ------ - SchemaMismatchError - If the schema of the row does not match the table schema. + UnknownColumnNameError + If the row has different column names than the table. """ - if self._schema != row.schema: - raise SchemaMismatchError + if self.number_of_columns == 0: + return Table.from_rows([row]) + + if len(set(self.column_names) - set(row.column_names)) > 0: + raise UnknownColumnNameError(list(set(self.column_names) - set(row.column_names))) + + row = row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0])) new_df = pd.concat([self._data, row._data]).infer_objects() new_df.columns = self.column_names - return Table._from_pandas_dataframe(new_df) + + schema = Schema.merge_multiple_schemas([self.schema, row.schema]) + + return Table._from_pandas_dataframe(new_df, schema) def add_rows(self, rows: list[Row] | Table) -> Table: """ Add multiple rows to a table. - This table is not modified. + The order of columns of the new rows will be adjusted to the order of columns in the table. + This table will contain the merged schema. Parameters ---------- @@ -678,21 +693,35 @@ def add_rows(self, rows: list[Row] | Table) -> Table: Raises ------ - SchemaMismatchError - If the schema of on of the row does not match the table schema. + UnknownColumnNameError + If at least one of the rows have different column names than the table. """ + if self.number_of_columns == 0: + return Table.from_rows(rows) + if isinstance(rows, Table): rows = rows.to_rows() - result = self._data + + sorted_rows = list() + for row in rows: + sorted_rows.append(row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0]))) + rows = sorted_rows + + missing_col_names = set() for row in rows: - if self._schema != row.schema: - raise SchemaMismatchError + missing_col_names.update(set(self.column_names) - set(row.column_names)) + if len(missing_col_names) > 0: + raise UnknownColumnNameError(list(missing_col_names)) + result = self._data row_frames = (row._data for row in rows) result = pd.concat([result, *row_frames]).infer_objects() result.columns = self.column_names - return Table._from_pandas_dataframe(result) + + schema = Schema.merge_multiple_schemas([self.schema] + list(row.schema for row in rows)) + + return Table._from_pandas_dataframe(result, schema) def filter_rows(self, query: Callable[[Row], bool]) -> Table: """ @@ -1025,8 +1054,6 @@ def sort_columns( If no comparator is given, the columns will be sorted alphabetically by their name. - This table is not modified. - Parameters ---------- comparator : Callable[[Column, Column], int] diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index a75a87241..60aee88bd 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -1,8 +1,10 @@ from __future__ import annotations +from copy import deepcopy from dataclasses import dataclass from typing import TYPE_CHECKING +from safeds.data.tabular.typing import Integer, RealNumber, Anything from safeds.data.tabular.typing._column_type import ColumnType from safeds.exceptions import UnknownColumnNameError @@ -220,6 +222,59 @@ def to_dict(self) -> dict[str, ColumnType]: """ return dict(self._schema) # defensive copy + @staticmethod + def merge_multiple_schemas(schemas: list[Schema]): + """ + Merge multiple schemas into one. + + For each type missmatch the new schema will have the least common supertype. + + The type hierarchy is as follows: + * Anything + * RealNumber + * Integer + * Boolean + * String + + Parameters + ---------- + schemas : list[Schema] + the list of schemas you want to merge + + Returns + ------- + schema : Schema + the new merged schema + + Raises + ------ + UnknownColumnNameError + if not all schemas have the same column names + """ + schema_dict = schemas[0]._schema + missing_col_names = set() + for schema in schemas: + missing_col_names.update(set(schema.column_names) - set(schema_dict.keys())) + if len(missing_col_names) > 0: + raise UnknownColumnNameError(list(missing_col_names)) + for schema in schemas: + if schema_dict != schema._schema: + for col_name in schema_dict.keys(): + nullable = False + if schema_dict[col_name].is_nullable() or schema.get_column_type(col_name).is_nullable(): + nullable = True + if isinstance(schema_dict[col_name], type(schema.get_column_type(col_name))): + if schema.get_column_type(col_name).is_nullable() and not schema_dict[col_name].is_nullable(): + new_type = deepcopy(schema.get_column_type(col_name)) + new_type._is_nullable = nullable + schema_dict[col_name] = new_type + continue + if (isinstance(schema_dict[col_name], RealNumber) and isinstance(schema.get_column_type(col_name), Integer)) or (isinstance(schema_dict[col_name], Integer) and isinstance(schema.get_column_type(col_name), RealNumber)): + schema_dict[col_name] = RealNumber(nullable) + continue + schema_dict[col_name] = Anything(nullable) + return Schema(schema_dict) + # ------------------------------------------------------------------------------------------------------------------ # IPython Integration # ------------------------------------------------------------------------------------------------------------------ diff --git a/tests/safeds/data/tabular/containers/_table/test_add_row.py b/tests/safeds/data/tabular/containers/_table/test_add_row.py index e75e96ebb..64bc39846 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_row.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_row.py @@ -1,25 +1,41 @@ import pytest from _pytest.python_api import raises from safeds.data.tabular.containers import Row, Table -from safeds.exceptions import SchemaMismatchError +from safeds.data.tabular.typing import Schema, Integer, Anything +from safeds.exceptions import UnknownColumnNameError @pytest.mark.parametrize( - ("table", "row"), + ("table", "row", "expected", "expected_schema"), [ - (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Row({"col1": 5, "col2": 6})), + (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Row({"col1": 5, "col2": 6}), + Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}), + Schema({"col1": Integer(), "col2": Integer()})), + (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Row({"col1": "5", "col2": 6}), + Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, 6]}), + Schema({"col1": Anything(), "col2": Integer()})), + (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Table.from_rows([Row({"col1": "5", "col2": None}), + Row({"col1": "5", "col2": 2})]).get_row(0), + Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, None]}), + Schema({"col1": Anything(), "col2": Integer(False)})), ], - ids=["added row"], + ids=["added row", "different schemas", "different schemas and nullable"], ) -def test_should_add_row(table: Table, row: Row) -> None: +def test_should_add_row(table: Table, row: Row, expected: Table, expected_schema: Schema) -> None: table = table.add_row(row) assert table.number_of_rows == 4 - assert table.get_row(3) == row - assert table.schema == row._schema + assert table.schema == expected_schema + assert table == expected -def test_should_raise_error_if_row_schema_invalid() -> None: - table1 = Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}) - row = Row({"col1": 5, "col2": "Hallo"}) - with raises(SchemaMismatchError, match=r"Failed because at least two schemas didn't match."): - table1.add_row(row) +@pytest.mark.parametrize( + ("table", "row", "expected_error_msg"), + [(Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Row({"col1": 5, "col3": "Hallo"}), r"Could not find column\(s\) 'col2'")], + ids=["unknown column col2 in row"] +) +def test_should_raise_error_if_row_column_names_invalid(table, row, expected_error_msg) -> None: + with raises(UnknownColumnNameError, match=expected_error_msg): + table.add_row(row) diff --git a/tests/safeds/data/tabular/containers/_table/test_add_rows.py b/tests/safeds/data/tabular/containers/_table/test_add_rows.py index 88d427784..ab2164a35 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_rows.py @@ -1,6 +1,6 @@ import pytest from safeds.data.tabular.containers import Row, Table -from safeds.exceptions import SchemaMismatchError +from safeds.exceptions import UnknownColumnNameError @pytest.mark.parametrize( @@ -35,8 +35,15 @@ def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Tabl assert table1 == expected -def test_should_raise_error_if_row_schema_invalid() -> None: - table1 = Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}) - row = [Row({"col1": 2, "col2": 4}), Row({"col1": 5, "col2": "Hallo"})] - with pytest.raises(SchemaMismatchError, match=r"Failed because at least two schemas didn't match."): - table1.add_rows(row) +@pytest.mark.parametrize( + ("table", "rows", "expected_error_msg"), + [ + (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + [Row({"col1": 2, "col3": 4}), Row({"col1": 5, "col2": "Hallo"})], + r"aa" + ), + ] +) +def test_should_raise_error_if_row_column_names_invalid(table: Table, rows: list[Row], expected_error_msg: str) -> None: + with pytest.raises(UnknownColumnNameError, match=expected_error_msg): + table.add_rows(rows) diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index f4e827353..8b185a8f9 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String +from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything from safeds.exceptions import UnknownColumnNameError if TYPE_CHECKING: @@ -235,6 +235,125 @@ def test_should_return_dict_for_schema(self, schema: Schema, expected: str) -> N assert schema.to_dict() == expected +class TestMergeMultipleSchemas: + @pytest.mark.parametrize( + ("schemas", "error_msg_regex"), + [ + ([Schema({"Column1": Anything()}), Schema({"Column2": Anything()})], r"Could not find column\(s\) 'Column2'") + ], + ids=["different_column_names"] + ) + def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], error_msg_regex: str): + with pytest.raises(UnknownColumnNameError, match=error_msg_regex): + Schema.merge_multiple_schemas(schemas) + + @pytest.mark.parametrize( + ("schemas", "expected"), + [ + ([Schema({"Column1": Integer()}), Schema({"Column1": Integer()})], Schema({"Column1": Integer()})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber()})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean()})), + ([Schema({"Column1": String()}), Schema({"Column1": String()})], Schema({"Column1": String()})), + ([Schema({"Column1": Anything()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber()})), + ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": Integer()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": Integer()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), + ([Schema({"Column1": String()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), + + ([Schema({"Column1": Integer(True)}), Schema({"Column1": Integer()})], Schema({"Column1": Integer(True)})), + ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(True)})), + ([Schema({"Column1": Boolean(True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean(True)})), + ([Schema({"Column1": String(True)}), Schema({"Column1": String()})], Schema({"Column1": String(True)})), + ([Schema({"Column1": Anything(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer(True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(True)})), + ([Schema({"Column1": Integer(True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer(True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Boolean(True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Boolean(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": String(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), + + ([Schema({"Column1": Integer()}), Schema({"Column1": Integer(True)})], Schema({"Column1": Integer(True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(True)})], Schema({"Column1": RealNumber(True)})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(True)})], Schema({"Column1": Boolean(True)})), + ([Schema({"Column1": String()}), Schema({"Column1": String(True)})], Schema({"Column1": String(True)})), + ([Schema({"Column1": Anything()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(True)})], Schema({"Column1": RealNumber(True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": String(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": String(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": String(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": String()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), + ], + ids=[ + "Integer Integer", + "RealNumber RealNumber", + "Boolean Boolean", + "String String", + "Anything Anything", + "Integer RealNumber", + "Integer Boolean", + "Integer String", + "Integer Anything", + "RealNumber Boolean", + "RealNumber String", + "RealNumber Anything", + "Boolean String", + "Boolean Anything", + "String Anything", + + "Integer(null) Integer", + "RealNumber(null) RealNumber", + "Boolean(null) Boolean", + "String(null) String", + "Anything(null) Anything", + "Integer(null) RealNumber", + "Integer(null) Boolean", + "Integer(null) String", + "Integer(null) Anything", + "RealNumber(null) Boolean", + "RealNumber(null) String", + "RealNumber(null) Anything", + "Boolean(null) String", + "Boolean(null) Anything", + "String(null) Anything", + + "Integer Integer(null)", + "RealNumber RealNumber(null)", + "Boolean Boolean(null)", + "String String(null)", + "Anything Anything(null)", + "Integer RealNumber(null)", + "Integer Boolean(null)", + "Integer String(null)", + "Integer Anything(null)", + "RealNumber Boolean(null)", + "RealNumber String(null)", + "RealNumber Anything(null)", + "Boolean String(null)", + "Boolean Anything(null)", + "String Anything(null)", + ] + ) + def test_should_return_merged_schema(self, schemas: list[Schema], expected: Schema): + assert Schema.merge_multiple_schemas(schemas) == expected + schemas.reverse() + assert Schema.merge_multiple_schemas(schemas) == expected # test the reversed list because the first parameter is handled differently + + class TestReprMarkdown: @pytest.mark.parametrize( ("schema", "expected"), From faea224bcbc80e727ec6896364d6c4a40d638f8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Gr=C3=A9us?= Date: Fri, 2 Jun 2023 12:23:25 +0200 Subject: [PATCH 04/51] test: Corrected tests for different schemas Co-authored-by: alex-senger <91055000+alex-senger@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 10 +++--- .../containers/_table/test_add_rows.py | 21 ++++++++--- .../containers/_table/test_from_rows.py | 35 +++++++++++++++---- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 06e9ac6f1..079f79dfc 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -702,17 +702,17 @@ def add_rows(self, rows: list[Row] | Table) -> Table: if isinstance(rows, Table): rows = rows.to_rows() - sorted_rows = list() - for row in rows: - sorted_rows.append(row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0]))) - rows = sorted_rows - missing_col_names = set() for row in rows: missing_col_names.update(set(self.column_names) - set(row.column_names)) if len(missing_col_names) > 0: raise UnknownColumnNameError(list(missing_col_names)) + sorted_rows = list() + for row in rows: + sorted_rows.append(row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0]))) + rows = sorted_rows + result = self._data row_frames = (row._data for row in rows) diff --git a/tests/safeds/data/tabular/containers/_table/test_add_rows.py b/tests/safeds/data/tabular/containers/_table/test_add_rows.py index ab2164a35..1b3e74aaf 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_rows.py @@ -11,11 +11,17 @@ [Row({"col1": "d", "col2": 6}), Row({"col1": "e", "col2": 8})], Table({"col1": ["a", "b", "c", "d", "e"], "col2": [1, 2, 4, 6, 8]}), ), + ( + Table({"col1": ["a", "b", "c"], "col2": [1, 2, 4]}), + [Row({"col1": "d", "col2": 6}), Row({"col1": "e", "col2": "f"})], + Table({"col1": ["a", "b", "c", "d", "e"], "col2": [1, 2, 4, 6, "f"]}), + ), ], - ids=["Rows with string and integer values"], + ids=["Rows with string and integer values", "different schema"], ) def test_should_add_rows(table1: Table, rows: list[Row], table2: Table) -> None: table1 = table1.add_rows(rows) + assert table1.schema == table2.schema assert table1 == table2 @@ -27,11 +33,17 @@ def test_should_add_rows(table1: Table, rows: list[Row], table2: Table) -> None: Table({"col1": [5, 7], "col2": [6, 8]}), Table({"col1": [1, 2, 1, 5, 7], "col2": [1, 2, 4, 6, 8]}), ), + ( + Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Table({"col1": [5, "7"], "col2": [6, None]}), + Table({"col1": [1, 2, 1, 5, "7"], "col2": [1, 2, 4, 6, None]}), + ), ], - ids=["Rows from table"], + ids=["Rows from table", "different schema"], ) def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Table) -> None: table1 = table1.add_rows(table2) + assert table1.schema == expected.schema assert table1 == expected @@ -40,9 +52,10 @@ def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Tabl [ (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), [Row({"col1": 2, "col3": 4}), Row({"col1": 5, "col2": "Hallo"})], - r"aa" + r"Could not find column\(s\) 'col2'" ), - ] + ], + ids=["column names do not match"] ) def test_should_raise_error_if_row_column_names_invalid(table: Table, rows: list[Row], expected_error_msg: str) -> None: with pytest.raises(UnknownColumnNameError, match=expected_error_msg): diff --git a/tests/safeds/data/tabular/containers/_table/test_from_rows.py b/tests/safeds/data/tabular/containers/_table/test_from_rows.py index 56c8a296a..a32e9508f 100644 --- a/tests/safeds/data/tabular/containers/_table/test_from_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_from_rows.py @@ -1,6 +1,6 @@ import pytest from safeds.data.tabular.containers import Row, Table -from safeds.exceptions import SchemaMismatchError +from safeds.exceptions import UnknownColumnNameError @pytest.mark.parametrize( @@ -24,14 +24,37 @@ }, ), ), + ( + [ + Row({"A": 1, "B": 4, "C": "d"}), + Row({"A": 2, "B": 5, "C": "e"}), + Row({"A": 3, "B": "6", "C": "f"}), + ], + Table( + { + "A": [1, 2, 3], + "B": [4, 5, "6"], + "C": ["d", "e", "f"], + }, + ), + ), ], - ids=["empty", "non-empty"], + ids=["empty", "non-empty", "different schemas"], ) def test_should_create_table_from_rows(rows: list[Row], expected: Table) -> None: - assert Table.from_rows(rows) == expected + table = Table.from_rows(rows) + assert table.schema == expected.schema + assert table == expected -def test_should_raise_error_if_mismatching_schema() -> None: - rows = [Row({"A": 1, "B": 2}), Row({"A": 2, "B": "a"})] - with pytest.raises(SchemaMismatchError, match=r"Failed because at least two schemas didn't match."): +@pytest.mark.parametrize( + ("rows", "expected_error_msg"), + [ + ( + [Row({"A": 1, "B": 2}), Row({"A": 2, "C": 4})], r"Could not find column\(s\) 'B'" + ) + ] +) +def test_should_raise_error_if_unknown_column_names(rows: list[Row], expected_error_msg: str) -> None: + with pytest.raises(UnknownColumnNameError, match=expected_error_msg): Table.from_rows(rows) From fbf0411bbebb29d7af2c5191d5b8842f39b52019 Mon Sep 17 00:00:00 2001 From: PhilipGutberlet <92990487+PhilipGutberlet@users.noreply.github.com> Date: Fri, 2 Jun 2023 12:24:13 +0200 Subject: [PATCH 05/51] Columny_type is working except numeric + print statements for Alex --- src/safeds/data/tabular/typing/_column_type.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 4dc60699a..185aa83f1 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from types import NoneType from typing import TYPE_CHECKING, Any import pandas as pd @@ -44,21 +45,24 @@ def columntype_of_type(celltype: Any) -> ColumnType: return RealNumber() if celltype == str: return String() - if celltype is None: + if celltype is NoneType: return Anything(is_nullable=True) #when Nothing() exists Nothing() else: message = f"Unsupported numpy data type '{celltype}'." raise NotImplementedError(message) + result = None # set type to Nothing as a default + is_nullable = False for cell in data: - result = None #set type to Nothing as a default - is_nullable = False + print(result) + print(data.dtype) + print(type(cell)) if result is None: result = columntype_of_type(type(cell)) elif result != columntype_of_type(type(cell)): is_nullable = True if result == Integer and type(cell) == float: - result = RealNumber() + result = RealNumber(is_nullable) else: result = Anything() return result From c7774d6145e5a9c4ce633f2f906dede1a8f8d3fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Gr=C3=A9us?= Date: Fri, 2 Jun 2023 13:41:02 +0200 Subject: [PATCH 06/51] feat: Added abstract constructor to `ColumnType` refactor: Sytisfied the linters Co-authored-by: alex-senger <91055000+alex-senger@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 6 +- .../data/tabular/typing/_column_type.py | 4 ++ src/safeds/data/tabular/typing/_schema.py | 6 +- .../tabular/containers/_table/test_add_row.py | 4 +- .../safeds/data/tabular/typing/test_schema.py | 66 +++++++++---------- 5 files changed, 44 insertions(+), 42 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 079f79dfc..8612c244e 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -696,12 +696,12 @@ def add_rows(self, rows: list[Row] | Table) -> Table: UnknownColumnNameError If at least one of the rows have different column names than the table. """ - if self.number_of_columns == 0: - return Table.from_rows(rows) - if isinstance(rows, Table): rows = rows.to_rows() + if self.number_of_columns == 0: + return Table.from_rows(rows) + missing_col_names = set() for row in rows: missing_col_names.update(set(self.column_names) - set(row.column_names)) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 82396f334..ae239be07 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -11,6 +11,10 @@ class ColumnType(ABC): """Abstract base class for column types.""" + @abstractmethod + def __init__(self, is_nullable: bool = False): + pass + @staticmethod def _from_numpy_data_type(data_type: np.dtype) -> ColumnType: """ diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index 60aee88bd..9d9ee951b 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -223,7 +223,7 @@ def to_dict(self) -> dict[str, ColumnType]: return dict(self._schema) # defensive copy @staticmethod - def merge_multiple_schemas(schemas: list[Schema]): + def merge_multiple_schemas(schemas: list[Schema]) -> Schema: """ Merge multiple schemas into one. @@ -265,9 +265,7 @@ def merge_multiple_schemas(schemas: list[Schema]): nullable = True if isinstance(schema_dict[col_name], type(schema.get_column_type(col_name))): if schema.get_column_type(col_name).is_nullable() and not schema_dict[col_name].is_nullable(): - new_type = deepcopy(schema.get_column_type(col_name)) - new_type._is_nullable = nullable - schema_dict[col_name] = new_type + schema_dict[col_name] = type(schema.get_column_type(col_name))(nullable) continue if (isinstance(schema_dict[col_name], RealNumber) and isinstance(schema.get_column_type(col_name), Integer)) or (isinstance(schema_dict[col_name], Integer) and isinstance(schema.get_column_type(col_name), RealNumber)): schema_dict[col_name] = RealNumber(nullable) diff --git a/tests/safeds/data/tabular/containers/_table/test_add_row.py b/tests/safeds/data/tabular/containers/_table/test_add_row.py index 64bc39846..eb3f9cee2 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_row.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_row.py @@ -20,7 +20,7 @@ Table.from_rows([Row({"col1": "5", "col2": None}), Row({"col1": "5", "col2": 2})]).get_row(0), Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, None]}), - Schema({"col1": Anything(), "col2": Integer(False)})), + Schema({"col1": Anything(), "col2": Integer(is_nullable=True)})), ], ids=["added row", "different schemas", "different schemas and nullable"], ) @@ -36,6 +36,6 @@ def test_should_add_row(table: Table, row: Row, expected: Table, expected_schema [(Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Row({"col1": 5, "col3": "Hallo"}), r"Could not find column\(s\) 'col2'")], ids=["unknown column col2 in row"] ) -def test_should_raise_error_if_row_column_names_invalid(table, row, expected_error_msg) -> None: +def test_should_raise_error_if_row_column_names_invalid(table: Table, row: Row, expected_error_msg: str) -> None: with raises(UnknownColumnNameError, match=expected_error_msg): table.add_row(row) diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 8b185a8f9..9aa9fb9ef 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -243,7 +243,7 @@ class TestMergeMultipleSchemas: ], ids=["different_column_names"] ) - def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], error_msg_regex: str): + def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], error_msg_regex: str) -> None: with pytest.raises(UnknownColumnNameError, match=error_msg_regex): Schema.merge_multiple_schemas(schemas) @@ -266,37 +266,37 @@ def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), ([Schema({"Column1": String()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": Integer(True)}), Schema({"Column1": Integer()})], Schema({"Column1": Integer(True)})), - ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(True)})), - ([Schema({"Column1": Boolean(True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean(True)})), - ([Schema({"Column1": String(True)}), Schema({"Column1": String()})], Schema({"Column1": String(True)})), - ([Schema({"Column1": Anything(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Integer(True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(True)})), - ([Schema({"Column1": Integer(True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Integer(True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Integer(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": RealNumber(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Boolean(True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Boolean(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": String(True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(True)})), - - ([Schema({"Column1": Integer()}), Schema({"Column1": Integer(True)})], Schema({"Column1": Integer(True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(True)})], Schema({"Column1": RealNumber(True)})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(True)})], Schema({"Column1": Boolean(True)})), - ([Schema({"Column1": String()}), Schema({"Column1": String(True)})], Schema({"Column1": String(True)})), - ([Schema({"Column1": Anything()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(True)})], Schema({"Column1": RealNumber(True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": String(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": String(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": String(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), - ([Schema({"Column1": String()}), Schema({"Column1": Anything(True)})], Schema({"Column1": Anything(True)})), + ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Integer()})], Schema({"Column1": Integer(is_nullable=True)})), + ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(is_nullable=True)})), + ([Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean(is_nullable=True)})), + ([Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": String(is_nullable=True)})), + ([Schema({"Column1": Anything(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(is_nullable=True)})), + ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), + + ([Schema({"Column1": Integer()}), Schema({"Column1": Integer(is_nullable=True)})], Schema({"Column1": Integer(is_nullable=True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(is_nullable=True)})], Schema({"Column1": RealNumber(is_nullable=True)})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(is_nullable=True)})], Schema({"Column1": Boolean(is_nullable=True)})), + ([Schema({"Column1": String()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": String(is_nullable=True)})), + ([Schema({"Column1": Anything()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(is_nullable=True)})], Schema({"Column1": RealNumber(is_nullable=True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Integer()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ([Schema({"Column1": String()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), ], ids=[ "Integer Integer", @@ -348,7 +348,7 @@ def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], "String Anything(null)", ] ) - def test_should_return_merged_schema(self, schemas: list[Schema], expected: Schema): + def test_should_return_merged_schema(self, schemas: list[Schema], expected: Schema) -> None: assert Schema.merge_multiple_schemas(schemas) == expected schemas.reverse() assert Schema.merge_multiple_schemas(schemas) == expected # test the reversed list because the first parameter is handled differently From 80ee4155ca06ab8491ebddc787082c7b53948a5c Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:43:11 +0000 Subject: [PATCH 07/51] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_row.py | 11 +- src/safeds/data/tabular/containers/_table.py | 11 +- src/safeds/data/tabular/typing/_schema.py | 15 +- .../tabular/containers/_table/test_add_row.py | 43 +++-- .../containers/_table/test_add_rows.py | 11 +- .../containers/_table/test_from_rows.py | 6 +- .../safeds/data/tabular/typing/test_schema.py | 168 +++++++++++++----- 7 files changed, 183 insertions(+), 82 deletions(-) diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py index b0aa5a1b7..363cc8af1 100644 --- a/src/safeds/data/tabular/containers/_row.py +++ b/src/safeds/data/tabular/containers/_row.py @@ -1,8 +1,8 @@ from __future__ import annotations import functools -from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Callable, Tuple +from collections.abc import Callable, Mapping +from typing import TYPE_CHECKING, Any import pandas as pd @@ -264,7 +264,7 @@ def __repr__(self) -> str: >>> repr(row) "Row({'a': 1})" """ - return f"Row({str(self)})" + return f"Row({self!s})" def __str__(self) -> str: """ @@ -445,8 +445,9 @@ def get_column_type(self, column_name: str) -> ColumnType: # Transformations # ------------------------------------------------------------------------------------------------------------------ - def sort_columns(self, comparator: Callable[[Tuple, Tuple], int] = lambda col1, col2: (col1[0] > col2[0]) - - (col1[0] < col2[0])) -> Row: + def sort_columns( + self, comparator: Callable[[tuple, tuple], int] = lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), + ) -> Row: """ Sort the columns of a `Row` with the given comparator and return a new `Row`. diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 8612c244e..e459dca9d 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -23,7 +23,6 @@ DuplicateColumnNameError, IndexOutOfBoundsError, NonNumericColumnError, - SchemaMismatchError, UnknownColumnNameError, WrongFileExtensionError, ) @@ -264,7 +263,7 @@ def from_rows(rows: list[Row]) -> Table: dataframe: DataFrame = pd.concat(row_array, ignore_index=True) dataframe.columns = column_names_compare - schema = Schema.merge_multiple_schemas(list(row.schema for row in rows)) + schema = Schema.merge_multiple_schemas([row.schema for row in rows]) return Table._from_pandas_dataframe(dataframe, schema) @@ -708,9 +707,11 @@ def add_rows(self, rows: list[Row] | Table) -> Table: if len(missing_col_names) > 0: raise UnknownColumnNameError(list(missing_col_names)) - sorted_rows = list() + sorted_rows = [] for row in rows: - sorted_rows.append(row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0]))) + sorted_rows.append( + row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0])), + ) rows = sorted_rows result = self._data @@ -719,7 +720,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table: result = pd.concat([result, *row_frames]).infer_objects() result.columns = self.column_names - schema = Schema.merge_multiple_schemas([self.schema] + list(row.schema for row in rows)) + schema = Schema.merge_multiple_schemas([self.schema, *[row.schema for row in rows]]) return Table._from_pandas_dataframe(result, schema) diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index 9d9ee951b..73ed9be7b 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -1,10 +1,9 @@ from __future__ import annotations -from copy import deepcopy from dataclasses import dataclass from typing import TYPE_CHECKING -from safeds.data.tabular.typing import Integer, RealNumber, Anything +from safeds.data.tabular.typing import Anything, Integer, RealNumber from safeds.data.tabular.typing._column_type import ColumnType from safeds.exceptions import UnknownColumnNameError @@ -97,7 +96,7 @@ def __repr__(self) -> str: >>> repr(schema) "Schema({'A': Integer})" """ - return f"Schema({str(self)})" + return f"Schema({self!s})" def __str__(self) -> str: """ @@ -259,7 +258,7 @@ def merge_multiple_schemas(schemas: list[Schema]) -> Schema: raise UnknownColumnNameError(list(missing_col_names)) for schema in schemas: if schema_dict != schema._schema: - for col_name in schema_dict.keys(): + for col_name in schema_dict: nullable = False if schema_dict[col_name].is_nullable() or schema.get_column_type(col_name).is_nullable(): nullable = True @@ -267,7 +266,13 @@ def merge_multiple_schemas(schemas: list[Schema]) -> Schema: if schema.get_column_type(col_name).is_nullable() and not schema_dict[col_name].is_nullable(): schema_dict[col_name] = type(schema.get_column_type(col_name))(nullable) continue - if (isinstance(schema_dict[col_name], RealNumber) and isinstance(schema.get_column_type(col_name), Integer)) or (isinstance(schema_dict[col_name], Integer) and isinstance(schema.get_column_type(col_name), RealNumber)): + if ( + isinstance(schema_dict[col_name], RealNumber) + and isinstance(schema.get_column_type(col_name), Integer) + ) or ( + isinstance(schema_dict[col_name], Integer) + and isinstance(schema.get_column_type(col_name), RealNumber) + ): schema_dict[col_name] = RealNumber(nullable) continue schema_dict[col_name] = Anything(nullable) diff --git a/tests/safeds/data/tabular/containers/_table/test_add_row.py b/tests/safeds/data/tabular/containers/_table/test_add_row.py index eb3f9cee2..e46279ad1 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_row.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_row.py @@ -1,26 +1,31 @@ import pytest from _pytest.python_api import raises from safeds.data.tabular.containers import Row, Table -from safeds.data.tabular.typing import Schema, Integer, Anything +from safeds.data.tabular.typing import Anything, Integer, Schema from safeds.exceptions import UnknownColumnNameError @pytest.mark.parametrize( ("table", "row", "expected", "expected_schema"), [ - (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Row({"col1": 5, "col2": 6}), - Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}), - Schema({"col1": Integer(), "col2": Integer()})), - (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Row({"col1": "5", "col2": 6}), - Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, 6]}), - Schema({"col1": Anything(), "col2": Integer()})), - (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Table.from_rows([Row({"col1": "5", "col2": None}), - Row({"col1": "5", "col2": 2})]).get_row(0), - Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, None]}), - Schema({"col1": Anything(), "col2": Integer(is_nullable=True)})), + ( + Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Row({"col1": 5, "col2": 6}), + Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}), + Schema({"col1": Integer(), "col2": Integer()}), + ), + ( + Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Row({"col1": "5", "col2": 6}), + Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, 6]}), + Schema({"col1": Anything(), "col2": Integer()}), + ), + ( + Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Table.from_rows([Row({"col1": "5", "col2": None}), Row({"col1": "5", "col2": 2})]).get_row(0), + Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, None]}), + Schema({"col1": Anything(), "col2": Integer(is_nullable=True)}), + ), ], ids=["added row", "different schemas", "different schemas and nullable"], ) @@ -33,8 +38,14 @@ def test_should_add_row(table: Table, row: Row, expected: Table, expected_schema @pytest.mark.parametrize( ("table", "row", "expected_error_msg"), - [(Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Row({"col1": 5, "col3": "Hallo"}), r"Could not find column\(s\) 'col2'")], - ids=["unknown column col2 in row"] + [ + ( + Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + Row({"col1": 5, "col3": "Hallo"}), + r"Could not find column\(s\) 'col2'", + ), + ], + ids=["unknown column col2 in row"], ) def test_should_raise_error_if_row_column_names_invalid(table: Table, row: Row, expected_error_msg: str) -> None: with raises(UnknownColumnNameError, match=expected_error_msg): diff --git a/tests/safeds/data/tabular/containers/_table/test_add_rows.py b/tests/safeds/data/tabular/containers/_table/test_add_rows.py index 1b3e74aaf..d69f4a47f 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_rows.py @@ -50,12 +50,13 @@ def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Tabl @pytest.mark.parametrize( ("table", "rows", "expected_error_msg"), [ - (Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - [Row({"col1": 2, "col3": 4}), Row({"col1": 5, "col2": "Hallo"})], - r"Could not find column\(s\) 'col2'" - ), + ( + Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), + [Row({"col1": 2, "col3": 4}), Row({"col1": 5, "col2": "Hallo"})], + r"Could not find column\(s\) 'col2'", + ), ], - ids=["column names do not match"] + ids=["column names do not match"], ) def test_should_raise_error_if_row_column_names_invalid(table: Table, rows: list[Row], expected_error_msg: str) -> None: with pytest.raises(UnknownColumnNameError, match=expected_error_msg): diff --git a/tests/safeds/data/tabular/containers/_table/test_from_rows.py b/tests/safeds/data/tabular/containers/_table/test_from_rows.py index a32e9508f..af8b459fe 100644 --- a/tests/safeds/data/tabular/containers/_table/test_from_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_from_rows.py @@ -49,11 +49,7 @@ def test_should_create_table_from_rows(rows: list[Row], expected: Table) -> None @pytest.mark.parametrize( ("rows", "expected_error_msg"), - [ - ( - [Row({"A": 1, "B": 2}), Row({"A": 2, "C": 4})], r"Could not find column\(s\) 'B'" - ) - ] + [([Row({"A": 1, "B": 2}), Row({"A": 2, "C": 4})], r"Could not find column\(s\) 'B'")], ) def test_should_raise_error_if_unknown_column_names(rows: list[Row], expected_error_msg: str) -> None: with pytest.raises(UnknownColumnNameError, match=expected_error_msg): diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 9aa9fb9ef..1317123fd 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything +from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, RealNumber, Schema, String from safeds.exceptions import UnknownColumnNameError if TYPE_CHECKING: @@ -238,10 +238,8 @@ def test_should_return_dict_for_schema(self, schema: Schema, expected: str) -> N class TestMergeMultipleSchemas: @pytest.mark.parametrize( ("schemas", "error_msg_regex"), - [ - ([Schema({"Column1": Anything()}), Schema({"Column2": Anything()})], r"Could not find column\(s\) 'Column2'") - ], - ids=["different_column_names"] + [([Schema({"Column1": Anything()}), Schema({"Column2": Anything()})], r"Could not find column\(s\) 'Column2'")], + ids=["different_column_names"], ) def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], error_msg_regex: str) -> None: with pytest.raises(UnknownColumnNameError, match=error_msg_regex): @@ -265,38 +263,126 @@ def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], ([Schema({"Column1": Boolean()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), ([Schema({"Column1": String()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - - ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Integer()})], Schema({"Column1": Integer(is_nullable=True)})), - ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(is_nullable=True)})), - ([Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean(is_nullable=True)})), - ([Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": String(is_nullable=True)})), - ([Schema({"Column1": Anything(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber(is_nullable=True)})), - ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": String()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": Anything()})], Schema({"Column1": Anything(is_nullable=True)})), - - ([Schema({"Column1": Integer()}), Schema({"Column1": Integer(is_nullable=True)})], Schema({"Column1": Integer(is_nullable=True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(is_nullable=True)})], Schema({"Column1": RealNumber(is_nullable=True)})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(is_nullable=True)})], Schema({"Column1": Boolean(is_nullable=True)})), - ([Schema({"Column1": String()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": String(is_nullable=True)})), - ([Schema({"Column1": Anything()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(is_nullable=True)})], Schema({"Column1": RealNumber(is_nullable=True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Integer()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": String(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), - ([Schema({"Column1": String()}), Schema({"Column1": Anything(is_nullable=True)})], Schema({"Column1": Anything(is_nullable=True)})), + ( + [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Integer()})], + Schema({"Column1": Integer(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": RealNumber()})], + Schema({"Column1": RealNumber(is_nullable=True)}), + ), + ( + [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Boolean()})], + Schema({"Column1": Boolean(is_nullable=True)}), + ), + ( + [Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": String()})], + Schema({"Column1": String(is_nullable=True)}), + ), + ( + [Schema({"Column1": Anything(is_nullable=True)}), Schema({"Column1": Anything()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": RealNumber()})], + Schema({"Column1": RealNumber(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Boolean()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": String()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Anything()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Boolean()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": String()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Anything()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": String()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Anything()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": Anything()})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer()}), Schema({"Column1": Integer(is_nullable=True)})], + Schema({"Column1": Integer(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(is_nullable=True)})], + Schema({"Column1": RealNumber(is_nullable=True)}), + ), + ( + [Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(is_nullable=True)})], + Schema({"Column1": Boolean(is_nullable=True)}), + ), + ( + [Schema({"Column1": String()}), Schema({"Column1": String(is_nullable=True)})], + Schema({"Column1": String(is_nullable=True)}), + ), + ( + [Schema({"Column1": Anything()}), Schema({"Column1": Anything(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(is_nullable=True)})], + Schema({"Column1": RealNumber(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer()}), Schema({"Column1": Boolean(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer()}), Schema({"Column1": String(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Integer()}), Schema({"Column1": Anything(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber()}), Schema({"Column1": String(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Boolean()}), Schema({"Column1": String(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": Boolean()}), Schema({"Column1": Anything(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), + ( + [Schema({"Column1": String()}), Schema({"Column1": Anything(is_nullable=True)})], + Schema({"Column1": Anything(is_nullable=True)}), + ), ], ids=[ "Integer Integer", @@ -314,7 +400,6 @@ def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], "Boolean String", "Boolean Anything", "String Anything", - "Integer(null) Integer", "RealNumber(null) RealNumber", "Boolean(null) Boolean", @@ -330,7 +415,6 @@ def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], "Boolean(null) String", "Boolean(null) Anything", "String(null) Anything", - "Integer Integer(null)", "RealNumber RealNumber(null)", "Boolean Boolean(null)", @@ -346,12 +430,14 @@ def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], "Boolean String(null)", "Boolean Anything(null)", "String Anything(null)", - ] + ], ) def test_should_return_merged_schema(self, schemas: list[Schema], expected: Schema) -> None: assert Schema.merge_multiple_schemas(schemas) == expected schemas.reverse() - assert Schema.merge_multiple_schemas(schemas) == expected # test the reversed list because the first parameter is handled differently + assert ( + Schema.merge_multiple_schemas(schemas) == expected + ) # test the reversed list because the first parameter is handled differently class TestReprMarkdown: From 88bb00bc4cab61eb90972a0a6d396357e6bcc4ae Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:44:51 +0000 Subject: [PATCH 08/51] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_row.py | 3 ++- src/safeds/data/tabular/containers/_table.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py index 363cc8af1..5d2d55f04 100644 --- a/src/safeds/data/tabular/containers/_row.py +++ b/src/safeds/data/tabular/containers/_row.py @@ -446,7 +446,8 @@ def get_column_type(self, column_name: str) -> ColumnType: # ------------------------------------------------------------------------------------------------------------------ def sort_columns( - self, comparator: Callable[[tuple, tuple], int] = lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), + self, + comparator: Callable[[tuple, tuple], int] = lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), ) -> Row: """ Sort the columns of a `Row` with the given comparator and return a new `Row`. diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index e459dca9d..916e93a0c 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -710,7 +710,9 @@ def add_rows(self, rows: list[Row] | Table) -> Table: sorted_rows = [] for row in rows: sorted_rows.append( - row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0])), + row.sort_columns( + lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0]), + ), ) rows = sorted_rows From ffbf2d61addc56158a3f1ce4e6702e22f7930f8c Mon Sep 17 00:00:00 2001 From: PhilipGutberlet <92990487+PhilipGutberlet@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:39:19 +0200 Subject: [PATCH 09/51] Everythings works properly except numeric Columns with None --- .../data/tabular/typing/_column_type.py | 38 +++++-------------- src/safeds/data/tabular/typing/_schema.py | 4 +- .../data/tabular/typing/test_column_type.py | 19 +++++----- .../safeds/data/tabular/typing/test_schema.py | 10 ++--- 4 files changed, 27 insertions(+), 44 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 185aa83f1..c12595541 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -38,52 +38,32 @@ def _data_type(data: pd.Series) -> ColumnType: def columntype_of_type(celltype: Any) -> ColumnType: if celltype == int: - return Integer() + return Integer(is_nullable) if celltype == bool: - return Boolean() + return Boolean(is_nullable) if celltype == float: - return RealNumber() + return RealNumber(is_nullable) if celltype == str: - return String() + return String(is_nullable) if celltype is NoneType: - return Anything(is_nullable=True) #when Nothing() exists Nothing() + return Nothing() else: message = f"Unsupported numpy data type '{celltype}'." raise NotImplementedError(message) - result = None # set type to Nothing as a default + result = Nothing() is_nullable = False for cell in data: - print(result) - print(data.dtype) - print(type(cell)) - if result is None: + if result == Nothing(): result = columntype_of_type(type(cell)) elif result != columntype_of_type(type(cell)): is_nullable = True if result == Integer and type(cell) == float: result = RealNumber(is_nullable) else: - result = Anything() - return result - - - - # if celltype == int: - # return Integer() - # if celltype == bool: - # return Boolean() - # if celltype == float: - # return RealNumber() - # if celltype == str: - # return String() - # else: - # return Anything() - - - # message = f"Unsupported numpy data type '{data_type}'." - # raise NotImplementedError(message) + result = Anything(is_nullable) + return result @abstractmethod def is_nullable(self) -> bool: diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index a75a87241..89765bb03 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -49,7 +49,9 @@ def _from_pandas_dataframe(dataframe: pd.DataFrame) -> Schema: """ names = dataframe.columns # noinspection PyProtectedMember - types = (ColumnType._from_numpy_data_type(data_type) for data_type in dataframe.dtypes) + types = [] + for col in dataframe: + types.append(ColumnType._data_type(dataframe[col])) return Schema(dict(zip(names, types, strict=True))) diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 928d7284a..eebdffb49 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -57,15 +57,16 @@ class TestDataType: @pytest.mark.parametrize( ("data", "expected"), [ - (pd.Series([1, 2, 3]), Integer()), - (pd.Series([1.0, 2.0, 3.0]), RealNumber()), - (pd.Series([True, False, True]), Boolean()), - (pd.Series(["a", "b", "c"]), String()), - (pd.Series([None, None, None]), Anything(is_nullable=True)), - (pd.Series([1, 2, None]), Anything(is_nullable=True)), - (pd.Series([1.0, 2.0, None]), Anything(is_nullable=True)), - (pd.Series([True, False, None]), Anything(is_nullable=True)), - (pd.Series(["a", "b", None]), Anything(is_nullable=True)), + (pd.Series([1, 2, 3]), Integer(is_nullable=False)), + (pd.Series([1.0, 2.0, 3.0]), RealNumber(is_nullable=False)), + (pd.Series([True, False, True]), Boolean(is_nullable=False)), + (pd.Series(["a", "b", "c"]), String(is_nullable=False)), + (pd.Series(["a", 1, 2.0]), Anything(is_nullable=False)), + (pd.Series([None, None, None]), Nothing()), + (pd.Series([1, 2, None]), Integer(is_nullable=True)), + (pd.Series([1.0, 2.0, None]), RealNumber(is_nullable=True)), + (pd.Series([True, False, None]), Boolean(is_nullable=True)), + (pd.Series(["a", None, "b"]), String(is_nullable=True)), ], ids=repr, diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index f4e827353..97bc15053 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String +from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything from safeds.exceptions import UnknownColumnNameError if TYPE_CHECKING: @@ -17,7 +17,7 @@ class TestFromPandasDataFrame: [ ( pd.DataFrame({"A": [True, False, True]}), - Schema({"A": Boolean()}), + Schema({"A": Boolean(is_nullable=False)}), ), ( pd.DataFrame({"A": [1, 2, 3]}), @@ -29,15 +29,15 @@ class TestFromPandasDataFrame: ), ( pd.DataFrame({"A": ["a", "b", "c"]}), - Schema({"A": String()}), + Schema({"A": String(is_nullable=False)}), ), ( pd.DataFrame({"A": [1, 2.0, "a", True]}), - Schema({"A": String()}), + Schema({"A": Anything(is_nullable=False)}), ), ( pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), - Schema({"A": Integer(), "B": String()}), + Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=False)}), ), ], ids=[ From cbd7722b2bcb342780ccf57799ae7b0baf1ea99d Mon Sep 17 00:00:00 2001 From: PhilipGutberlet <92990487+PhilipGutberlet@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:45:10 +0200 Subject: [PATCH 10/51] Changes by hussi --- src/safeds/data/tabular/typing/__init__.py | 3 +- .../data/tabular/typing/_column_type.py | 46 +++++++++++++++++-- .../data/tabular/typing/test_column_type.py | 43 +---------------- .../safeds/data/tabular/typing/test_schema.py | 41 +++++++++++++++-- 4 files changed, 84 insertions(+), 49 deletions(-) diff --git a/src/safeds/data/tabular/typing/__init__.py b/src/safeds/data/tabular/typing/__init__.py index 9a19c2b5d..14823d345 100644 --- a/src/safeds/data/tabular/typing/__init__.py +++ b/src/safeds/data/tabular/typing/__init__.py @@ -1,6 +1,6 @@ """Types used to define the schema of a tabular dataset.""" -from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String +from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String, Nothing from ._imputer_strategy import ImputerStrategy from ._schema import Schema @@ -13,4 +13,5 @@ "RealNumber", "Schema", "String", + "Nothing", ] diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index c12595541..b710e5256 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -56,9 +56,11 @@ def columntype_of_type(celltype: Any) -> ColumnType: for cell in data: if result == Nothing(): result = columntype_of_type(type(cell)) - elif result != columntype_of_type(type(cell)): - is_nullable = True - if result == Integer and type(cell) == float: + if result != columntype_of_type(type(cell)): + if type(cell) is NoneType: + is_nullable = True + result._is_nullable = is_nullable + elif result == Integer and type(cell) == float: result = RealNumber(is_nullable) else: result = Anything(is_nullable) @@ -311,3 +313,41 @@ def is_numeric(self) -> bool: True if the column is numeric. """ return False + + +@dataclass +class Nothing(ColumnType): + """Type for a column that contains None Values only.""" + + _is_nullable: bool + + def __init__(self): + self._is_nullable = True + + def __repr__(self) -> str: + result = "Nothing" + if self._is_nullable: + result += "?" + return result + + def is_nullable(self) -> bool: + """ + Return whether the given column type is nullable. + + Returns + ------- + is_nullable : bool + True if the column is nullable. + """ + return True + + def is_numeric(self) -> bool: + """ + Return whether the given column type is numeric. + + Returns + ------- + is_numeric : bool + True if the column is numeric. + """ + return False diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index eebdffb49..4b817967e 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,8 +1,6 @@ -import numpy as np import pandas as pd import pytest -from safeds.data.tabular.containers import Column from safeds.data.tabular.typing import ( Anything, Boolean, @@ -10,49 +8,10 @@ Integer, RealNumber, String, + Nothing, ) -# class TestFromNumpyDataType: -# # Test cases taken from https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars -# @pytest.mark.parametrize( -# ("data_type", "expected"), -# [ -# # Boolean -# (np.dtype(np.bool_), Boolean()), -# # Number -# (np.dtype(np.half), RealNumber()), -# (np.dtype(np.single), RealNumber()), -# (np.dtype(np.float_), RealNumber()), -# (np.dtype(np.longfloat), RealNumber()), -# # Int -# (np.dtype(np.byte), Integer()), -# (np.dtype(np.short), Integer()), -# (np.dtype(np.intc), Integer()), -# (np.dtype(np.int_), Integer()), -# (np.dtype(np.longlong), Integer()), -# (np.dtype(np.ubyte), Integer()), -# (np.dtype(np.ushort), Integer()), -# (np.dtype(np.uintc), Integer()), -# (np.dtype(np.uint), Integer()), -# (np.dtype(np.ulonglong), Integer()), -# # String -# (np.dtype(np.str_), String()), -# (np.dtype(np.unicode_), String()), -# (np.dtype(np.object_), String()), -# (np.dtype(np.datetime64), String()), -# (np.dtype(np.timedelta64), String()), -# ], -# ids=repr, -# ) -# def test_should_create_column_type_from_numpy_data_type(self, data_type: np.dtype, expected: ColumnType) -> None: -# assert ColumnType._from_numpy_data_type(data_type) == expected -# -# def test_should_raise_if_data_type_is_not_supported(self) -> None: -# with pytest.raises(NotImplementedError): -# ColumnType._from_numpy_data_type(np.dtype(np.void)) -# - class TestDataType: @pytest.mark.parametrize( ("data", "expected"), diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 97bc15053..1845694fe 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -21,11 +21,11 @@ class TestFromPandasDataFrame: ), ( pd.DataFrame({"A": [1, 2, 3]}), - Schema({"A": Integer()}), + Schema({"A": Integer(is_nullable=False)}), ), ( pd.DataFrame({"A": [1.0, 2.0, 3.0]}), - Schema({"A": RealNumber()}), + Schema({"A": RealNumber(is_nullable=False)}), ), ( pd.DataFrame({"A": ["a", "b", "c"]}), @@ -39,14 +39,49 @@ class TestFromPandasDataFrame: pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=False)}), ), + ( + pd.DataFrame({"A": [True, False, None]}), + Schema({"A": Boolean(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": [None, 2, 3]}), + Schema({"A": Integer(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": [None, 2.0, 3.0]}), + Schema({"A": RealNumber(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": ["a", None, "b"]}), + Schema({"A": String(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": [1, 2.0, "a", True, None]}), + Schema({"A": Anything(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", None]}), + Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"], "C": [True, True, False]}), + Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=False), "C": Boolean(is_nullable=False)}), + ), ], ids=[ + "boolean", "integer", "real number", "string", - "boolean", "mixed", "multiple columns", + "boolean?", + "integer?", + "real number?", + "string?", + "anything?", + "integer, string?", + "integer, string, boolean" ], ) def test_should_create_schema_from_pandas_dataframe(self, dataframe: pd.DataFrame, expected: Schema) -> None: From aae7b81b7713b6a0b3ac9e075039239426ce5daa Mon Sep 17 00:00:00 2001 From: daniaHu Date: Fri, 23 Jun 2023 10:47:22 +0200 Subject: [PATCH 11/51] feat: fixed some tests, now Columns aren't wraped in pd.Series --- .../data/tabular/typing/_column_type.py | 18 ++++----- .../_column/test_from_pandas_series.py | 6 +-- .../tabular/containers/_column/test_init.py | 6 +-- .../tabular/containers/_table/test_split.py | 4 +- .../data/tabular/typing/test_column_type.py | 27 ++++++++------ .../safeds/data/tabular/typing/test_schema.py | 37 ++++++++++--------- 6 files changed, 52 insertions(+), 46 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index b710e5256..4f95111dc 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -36,27 +36,27 @@ def _data_type(data: pd.Series) -> ColumnType: If the given data type is not supported. """ - def columntype_of_type(celltype: Any) -> ColumnType: - if celltype == int: + def column_type_of_type(cell_type: Any) -> ColumnType: + if cell_type == int: return Integer(is_nullable) - if celltype == bool: + if cell_type == bool: return Boolean(is_nullable) - if celltype == float: + if cell_type == float: return RealNumber(is_nullable) - if celltype == str: + if cell_type == str: return String(is_nullable) - if celltype is NoneType: + if cell_type is NoneType: return Nothing() else: - message = f"Unsupported numpy data type '{celltype}'." + message = f"Unsupported numpy data type '{cell_type}'." raise NotImplementedError(message) result = Nothing() is_nullable = False for cell in data: if result == Nothing(): - result = columntype_of_type(type(cell)) - if result != columntype_of_type(type(cell)): + result = column_type_of_type(type(cell)) + if result != column_type_of_type(type(cell)): if type(cell) is NoneType: is_nullable = True result._is_nullable = is_nullable diff --git a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py index 9946120b2..f26d9a60c 100644 --- a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py +++ b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Column -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String +from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything @pytest.mark.parametrize( @@ -35,12 +35,12 @@ def test_should_use_type_if_passed(series: pd.Series, type_: ColumnType) -> None @pytest.mark.parametrize( ("series", "expected"), [ - (pd.Series([]), String()), + (pd.Series([]), Nothing()), (pd.Series([True, False, True]), Boolean()), (pd.Series([1, 2, 3]), Integer()), (pd.Series([1.0, 2.0, 3.0]), RealNumber()), (pd.Series(["a", "b", "c"]), String()), - (pd.Series([1, 2.0, "a", True]), String()), + (pd.Series([1, 2.0, "a", True]), Anything(is_nullable=False)), ], ids=["empty", "boolean", "integer", "real number", "string", "mixed"], ) diff --git a/tests/safeds/data/tabular/containers/_column/test_init.py b/tests/safeds/data/tabular/containers/_column/test_init.py index 2966b2970..68a11d40c 100644 --- a/tests/safeds/data/tabular/containers/_column/test_init.py +++ b/tests/safeds/data/tabular/containers/_column/test_init.py @@ -3,7 +3,7 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Column -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String +from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything def test_should_store_the_name() -> None: @@ -43,12 +43,12 @@ def test_should_store_the_data(column: Column, expected: list) -> None: @pytest.mark.parametrize( ("column", "expected"), [ - (Column("A", []), String()), + (Column("A", []), Nothing()), (Column("A", [True, False, True]), Boolean()), (Column("A", [1, 2, 3]), Integer()), (Column("A", [1.0, 2.0, 3.0]), RealNumber()), (Column("A", ["a", "b", "c"]), String()), - (Column("A", [1, 2.0, "a", True]), String()), + (Column("A", [1, 2.0, "a", True]), Anything()), ], ids=["empty", "boolean", "integer", "real number", "string", "mixed"], ) diff --git a/tests/safeds/data/tabular/containers/_table/test_split.py b/tests/safeds/data/tabular/containers/_table/test_split.py index 36789a346..ef39bb2ab 100644 --- a/tests/safeds/data/tabular/containers/_table/test_split.py +++ b/tests/safeds/data/tabular/containers/_table/test_split.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import Integer, Schema +from safeds.data.tabular.typing import Integer, Schema, Nothing @pytest.mark.parametrize( @@ -15,7 +15,7 @@ ), ( Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})), + Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Nothing(), "col2": Nothing()})), Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), 0, ), diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 4b817967e..d0882cb5d 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,3 +1,5 @@ +from typing import Iterable + import pandas as pd import pytest @@ -16,21 +18,22 @@ class TestDataType: @pytest.mark.parametrize( ("data", "expected"), [ - (pd.Series([1, 2, 3]), Integer(is_nullable=False)), - (pd.Series([1.0, 2.0, 3.0]), RealNumber(is_nullable=False)), - (pd.Series([True, False, True]), Boolean(is_nullable=False)), - (pd.Series(["a", "b", "c"]), String(is_nullable=False)), - (pd.Series(["a", 1, 2.0]), Anything(is_nullable=False)), - (pd.Series([None, None, None]), Nothing()), - (pd.Series([1, 2, None]), Integer(is_nullable=True)), - (pd.Series([1.0, 2.0, None]), RealNumber(is_nullable=True)), - (pd.Series([True, False, None]), Boolean(is_nullable=True)), - (pd.Series(["a", None, "b"]), String(is_nullable=True)), + ([1, 2, 3], Integer(is_nullable=False)), + ([1.0, 2.0, 3.0], RealNumber(is_nullable=False)), + ([True, False, True], Boolean(is_nullable=False)), + (["a", "b", "c"], String(is_nullable=False)), + (["a", 1, 2.0], Anything(is_nullable=False)), + ([None, None, None], Nothing()), + ([None, 1, 2], Integer(is_nullable=True)), + ([1.0, 2.0, None], RealNumber(is_nullable=True)), + ([True, False, None], Boolean(is_nullable=True)), + (["a", None, "b"], String(is_nullable=True)), ], - ids=repr, + ids=["Integer", "Real number", "Boolean", "String", "Mixed", "None", "Nullable integer", + "Nullable RealNumber", "Nullable Boolean", "Nullable String"], ) - def test_should_return_the_data_type(self, data: pd.Series, expected: ColumnType) -> None: + def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None: assert ColumnType._data_type(data) == expected diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 1845694fe..2e1731863 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -4,6 +4,8 @@ import pandas as pd import pytest + +from safeds.data.tabular.containers import Column, Table from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything from safeds.exceptions import UnknownColumnNameError @@ -13,59 +15,60 @@ class TestFromPandasDataFrame: @pytest.mark.parametrize( - ("dataframe", "expected"), + ("columns", "expected"), [ ( - pd.DataFrame({"A": [True, False, True]}), + Column("A", [True, False, True]), Schema({"A": Boolean(is_nullable=False)}), ), ( - pd.DataFrame({"A": [1, 2, 3]}), + Column("A", [1, 2, 3]), Schema({"A": Integer(is_nullable=False)}), ), ( - pd.DataFrame({"A": [1.0, 2.0, 3.0]}), + Column("A", [1.0, 2.0, 3.0]), Schema({"A": RealNumber(is_nullable=False)}), ), ( - pd.DataFrame({"A": ["a", "b", "c"]}), + Column("A", ["a", "b", "c"]), Schema({"A": String(is_nullable=False)}), ), ( - pd.DataFrame({"A": [1, 2.0, "a", True]}), + Column("A", [1, 2.0, "a", True]), Schema({"A": Anything(is_nullable=False)}), ), ( - pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), + Table({"A": [1, 2, 3], "B": ["a", "b", "c"]}), Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=False)}), ), ( - pd.DataFrame({"A": [True, False, None]}), + Column("A", [True, False, None]), Schema({"A": Boolean(is_nullable=True)}), ), ( - pd.DataFrame({"A": [None, 2, 3]}), + Column("A", [None, 2, 3]), Schema({"A": Integer(is_nullable=True)}), ), ( - pd.DataFrame({"A": [None, 2.0, 3.0]}), + Column("A", [2.0, None, 3.0]), Schema({"A": RealNumber(is_nullable=True)}), ), ( - pd.DataFrame({"A": ["a", None, "b"]}), + Column("A", ["a", None, "b"]), Schema({"A": String(is_nullable=True)}), ), ( - pd.DataFrame({"A": [1, 2.0, "a", True, None]}), + Column("A", [1, 2.0, "a", True, None]), Schema({"A": Anything(is_nullable=True)}), ), ( - pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", None]}), + Table({"A": [1, 2, 3], "B": ["a", "b", None]}), Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=True)}), ), ( - pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"], "C": [True, True, False]}), - Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=False), "C": Boolean(is_nullable=False)}), + Table({"A": [1, 2, 3], "B": ["a", "b", "c"], "C": [True, True, False]}), + Schema( + {"A": Integer(is_nullable=False), "B": String(is_nullable=False), "C": Boolean(is_nullable=False)}), ), ], ids=[ @@ -84,8 +87,8 @@ class TestFromPandasDataFrame: "integer, string, boolean" ], ) - def test_should_create_schema_from_pandas_dataframe(self, dataframe: pd.DataFrame, expected: Schema) -> None: - assert Schema._from_pandas_dataframe(dataframe) == expected + def test_should_create_schema_from_pandas_dataframe(self, columns: Column | Table, expected: Schema) -> None: + assert Schema._from_pandas_dataframe(columns) == expected class TestRepr: From f54fdd5888e2ca6e02e9455eade1fe30d55700d7 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 23 Jun 2023 13:31:17 +0200 Subject: [PATCH 12/51] Fixed Bug where test would break if the first cell in a column is null --- .../data/tabular/typing/_column_type.py | 3 +++ .../data/tabular/typing/test_column_type.py | 23 ++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index b710e5256..9420cd208 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -56,6 +56,9 @@ def columntype_of_type(celltype: Any) -> ColumnType: for cell in data: if result == Nothing(): result = columntype_of_type(type(cell)) + if type(cell) is NoneType: + is_nullable = True + result._is_nullable = is_nullable if result != columntype_of_type(type(cell)): if type(cell) is NoneType: is_nullable = True diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 4b817967e..47a38bcdb 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -16,19 +16,20 @@ class TestDataType: @pytest.mark.parametrize( ("data", "expected"), [ - (pd.Series([1, 2, 3]), Integer(is_nullable=False)), - (pd.Series([1.0, 2.0, 3.0]), RealNumber(is_nullable=False)), - (pd.Series([True, False, True]), Boolean(is_nullable=False)), - (pd.Series(["a", "b", "c"]), String(is_nullable=False)), - (pd.Series(["a", 1, 2.0]), Anything(is_nullable=False)), - (pd.Series([None, None, None]), Nothing()), - (pd.Series([1, 2, None]), Integer(is_nullable=True)), - (pd.Series([1.0, 2.0, None]), RealNumber(is_nullable=True)), - (pd.Series([True, False, None]), Boolean(is_nullable=True)), - (pd.Series(["a", None, "b"]), String(is_nullable=True)), + (([1, 2, 3]), Integer(is_nullable=False)), + (([1.0, 2.0, 3.0]), RealNumber(is_nullable=False)), + (([True, False, True]), Boolean(is_nullable=False)), + ((["a", "b", "c"]), String(is_nullable=False)), + ((["a", 1, 2.0]), Anything(is_nullable=False)), + (([None, None, None]), Nothing()), + (([1, 2, None]), Integer(is_nullable=True)), + (([1.0, 2.0, None]), RealNumber(is_nullable=True)), + (([True, False, None]), Boolean(is_nullable=True)), + ((["a", None, "b"]), String(is_nullable=True)), ], - ids=repr, + ids=["Integer", "RealNumber", "Boolean", "String", "Mixed", "None", "Nullable Integer", "Nullable RealNumber", + "Nullable Boolean", "Nullable String"], ) def test_should_return_the_data_type(self, data: pd.Series, expected: ColumnType) -> None: assert ColumnType._data_type(data) == expected From 655bee04bbf8319dd05bba009f016a3e6f187377 Mon Sep 17 00:00:00 2001 From: daniaHu Date: Fri, 23 Jun 2023 16:07:03 +0200 Subject: [PATCH 13/51] changes rolled back, couldn't find a way to work with pd.DataFrame and still get correct schema by Columns with numbers and missing values --- .../data/tabular/typing/_column_type.py | 6 +- .../data/tabular/typing/test_column_type.py | 3 +- .../safeds/data/tabular/typing/test_schema.py | 57 ++++++++----------- 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 80bed53a7..68439dcbe 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -3,14 +3,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from types import NoneType -from typing import TYPE_CHECKING, Any +from typing import Any import pandas as pd -if TYPE_CHECKING: - import numpy as np - from safeds.data.tabular.containers import Column - class ColumnType(ABC): """Abstract base class for column types.""" diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 654519715..9655f656d 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,4 +1,5 @@ -import pandas as pd +from typing import Iterable + import pytest from safeds.data.tabular.typing import ( diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 2e1731863..0dd67e9c0 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -1,11 +1,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Iterable import pandas as pd import pytest -from safeds.data.tabular.containers import Column, Table + from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything from safeds.exceptions import UnknownColumnNameError @@ -18,58 +18,49 @@ class TestFromPandasDataFrame: ("columns", "expected"), [ ( - Column("A", [True, False, True]), - Schema({"A": Boolean(is_nullable=False)}), + pd.DataFrame({"A": [True, False, True]}), + Schema({"A": Boolean()}), ), ( - Column("A", [1, 2, 3]), - Schema({"A": Integer(is_nullable=False)}), + pd.DataFrame({"A": [1, 2, 3]}), + Schema({"A": Integer()}), ), ( - Column("A", [1.0, 2.0, 3.0]), - Schema({"A": RealNumber(is_nullable=False)}), + pd.DataFrame({"A": [1.0, 2.0, 3.0]}), + Schema({"A": RealNumber()}), ), ( - Column("A", ["a", "b", "c"]), - Schema({"A": String(is_nullable=False)}), + pd.DataFrame({"A": ["a", "b", "c"]}), + Schema({"A": String()}), ), ( - Column("A", [1, 2.0, "a", True]), - Schema({"A": Anything(is_nullable=False)}), + pd.DataFrame({"A": [1, 2.0, "a", True]}), + Schema({"A": Anything()}), ), ( - Table({"A": [1, 2, 3], "B": ["a", "b", "c"]}), - Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=False)}), + pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), + Schema({"A": Integer(), "B": String()}), ), ( - Column("A", [True, False, None]), + pd.DataFrame({"A": [True, False, None]}), Schema({"A": Boolean(is_nullable=True)}), ), ( - Column("A", [None, 2, 3]), - Schema({"A": Integer(is_nullable=True)}), + pd.DataFrame({"A": [1, None, 3]}), + Schema({"A": RealNumber()}), ), ( - Column("A", [2.0, None, 3.0]), - Schema({"A": RealNumber(is_nullable=True)}), + pd.DataFrame({"A": [1.0, None, 3.0]}), + Schema({"A": RealNumber()}), ), ( - Column("A", ["a", None, "b"]), + pd.DataFrame({"A": ["a", None, "c"]}), Schema({"A": String(is_nullable=True)}), ), ( - Column("A", [1, 2.0, "a", True, None]), + pd.DataFrame({"A": [1, 2.0, None, True]}), Schema({"A": Anything(is_nullable=True)}), ), - ( - Table({"A": [1, 2, 3], "B": ["a", "b", None]}), - Schema({"A": Integer(is_nullable=False), "B": String(is_nullable=True)}), - ), - ( - Table({"A": [1, 2, 3], "B": ["a", "b", "c"], "C": [True, True, False]}), - Schema( - {"A": Integer(is_nullable=False), "B": String(is_nullable=False), "C": Boolean(is_nullable=False)}), - ), ], ids=[ "boolean", @@ -82,12 +73,10 @@ class TestFromPandasDataFrame: "integer?", "real number?", "string?", - "anything?", - "integer, string?", - "integer, string, boolean" + "Anything?", ], ) - def test_should_create_schema_from_pandas_dataframe(self, columns: Column | Table, expected: Schema) -> None: + def test_should_create_schema_from_pandas_dataframe(self, columns: Iterable, expected: Schema) -> None: assert Schema._from_pandas_dataframe(columns) == expected From 923cad002ffecceb6f4bb6dc95e60af030ee9b46 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 30 Jun 2023 11:34:39 +0200 Subject: [PATCH 14/51] fix: fix wrong datatype error Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/typing/_column_type.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 68439dcbe..7dbd2ab7a 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -5,6 +5,7 @@ from types import NoneType from typing import Any +import numpy as np import pandas as pd @@ -33,11 +34,11 @@ def _data_type(data: pd.Series) -> ColumnType: """ def column_type_of_type(cell_type: Any) -> ColumnType: - if cell_type == int: + if cell_type == int or cell_type == np.int64 or cell_type == np.int32: return Integer(is_nullable) if cell_type == bool: return Boolean(is_nullable) - if cell_type == float: + if cell_type == float or cell_type == np.float64 or cell_type == np.float32: return RealNumber(is_nullable) if cell_type == str: return String(is_nullable) From 2431d0288212348ad75acda72132a9f7560f28a2 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 30 Jun 2023 15:24:44 +0200 Subject: [PATCH 15/51] fix: fix merge problems Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 49 +++++-------------- .../tabular/containers/_table/test_add_row.py | 24 ++++----- 2 files changed, 23 insertions(+), 50 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index c26fffa90..7bbb4b24d 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -923,15 +923,12 @@ def add_row(self, row: Row) -> Table: 0 1 2 1 3 4 """ - if self.number_of_columns == 0: - return Table.from_rows([row]) + int_columns = [] + result = self.remove_columns([]) # clone if len(set(self.column_names) - set(row.column_names)) > 0: raise UnknownColumnNameError(list(set(self.column_names) - set(row.column_names))) - row = row.sort_columns(lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0])) - int_columns = [] - result = self.remove_columns([]) # clone if result.number_of_rows == 0: int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names)) if result.number_of_columns == 0: @@ -939,23 +936,18 @@ def add_row(self, row: Row) -> Table: result._data[column] = Column(column, []) result._schema = Schema._from_pandas_dataframe(result._data) elif result.column_names != row.column_names: - raise SchemaMismatchError - elif result._schema != row.schema: - raise SchemaMismatchError + raise UnknownColumnNameError new_df = pd.concat([result._data, row._data]).infer_objects() new_df.columns = result.column_names result = Table._from_pandas_dataframe(new_df) - new_df = pd.concat([self._data, row._data]).infer_objects() - new_df.columns = self.column_names - - schema = Schema.merge_multiple_schemas([self.schema, row.schema]) - - return Table._from_pandas_dataframe(new_df, schema) for column in int_columns: result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))]) + schema = Schema.merge_multiple_schemas([result.schema, row.schema]) + result._schema = schema + return result def add_rows(self, rows: list[Row] | Table) -> Table: @@ -994,9 +986,8 @@ def add_rows(self, rows: list[Row] | Table) -> Table: """ if isinstance(rows, Table): rows = rows.to_rows() - - if self.number_of_columns == 0: - return Table.from_rows(rows) + int_columns = [] + result = self.remove_columns([]) # clone missing_col_names = set() for row in rows: @@ -1004,36 +995,16 @@ def add_rows(self, rows: list[Row] | Table) -> Table: if len(missing_col_names) > 0: raise UnknownColumnNameError(list(missing_col_names)) - sorted_rows = [] - int_columns = [] - result = self.remove_columns([]) # clone for row in rows: - sorted_rows.append( - row.sort_columns( - lambda col1, col2: self.column_names.index(col2[0]) - self.column_names.index(col1[0]), - ), - ) - rows = sorted_rows if result.number_of_rows == 0: int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names)) if result.number_of_columns == 0: for column in row.column_names: result._data[column] = Column(column, []) result._schema = Schema._from_pandas_dataframe(result._data) - elif result.column_names != row.column_names: - raise SchemaMismatchError - elif result._schema != row.schema: - raise SchemaMismatchError - result = self._data row_frames = (row._data for row in rows) - result = pd.concat([result, *row_frames]).infer_objects() - result.columns = self.column_names - - schema = Schema.merge_multiple_schemas([self.schema, *[row.schema for row in rows]]) - - return Table._from_pandas_dataframe(result, schema) new_df = pd.concat([result._data, *row_frames]).infer_objects() new_df.columns = result.column_names result = Table._from_pandas_dataframe(new_df) @@ -1041,6 +1012,8 @@ def add_rows(self, rows: list[Row] | Table) -> Table: for column in int_columns: result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))]) + result._schema = Schema.merge_multiple_schemas([self.schema, *[row.schema for row in rows]]) + return result def filter_rows(self, query: Callable[[Row], bool]) -> Table: @@ -1515,7 +1488,7 @@ def slice_rows( def sort_columns( self, comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name) - - (col1.name < col2.name), + - (col1.name < col2.name), ) -> Table: """ Sort the columns of a `Table` with the given comparator and return a new `Table`. diff --git a/tests/safeds/data/tabular/containers/_table/test_add_row.py b/tests/safeds/data/tabular/containers/_table/test_add_row.py index e1bd81fc9..5a748e84f 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_row.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_row.py @@ -22,7 +22,7 @@ ), ( Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Table.from_rows([Row({"col1": "5", "col2": None}), Row({"col1": "5", "col2": 2})]).get_row(0), + Row({"col1": "5", "col2": None}), Table({"col1": [1, 2, 1, "5"], "col2": [1, 2, 4, None]}), Schema({"col1": Anything(), "col2": Integer(is_nullable=True)}), ), @@ -30,17 +30,22 @@ Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Row({"col1": 5, "col2": 6}), Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}), + Schema({"col1": Integer(), "col2": Integer()}) ), - (Table({"col2": [], "col4": []}), Row({"col2": 5, "col4": 6}), Table({"col2": [5], "col4": [6]})), - (Table(), Row({"col2": 5, "col4": 6}), Table({"col2": [5], "col4": [6]})), + ( + Table({"col1": [], "col2": []}), + Row({"col1": 5, "col2": 6}), + Table({"col1": [5], "col2": [6]}), + Schema({"col1": Integer(), "col2": Integer()}) + ) ], ids=["added row", "different schemas", "different schemas and nullable", "add row to rowless table", "add row to empty table"], ) def test_should_add_row(table: Table, row: Row, expected: Table, expected_schema: Schema) -> None: - table = table.add_row(row) - assert table.number_of_rows == 4 - assert table.schema == expected_schema - assert table == expected + result = table.add_row(row) + assert result.number_of_rows - 1 == table.number_of_rows + assert result.schema == expected_schema + assert result == expected @pytest.mark.parametrize( @@ -57,8 +62,3 @@ def test_should_add_row(table: Table, row: Row, expected: Table, expected_schema def test_should_raise_error_if_row_column_names_invalid(table: Table, row: Row, expected_error_msg: str) -> None: with raises(UnknownColumnNameError, match=expected_error_msg): table.add_row(row) - - -def test_should_raise_schema_mismatch() -> None: - with raises(SchemaMismatchError, match=r"Failed because at least two schemas didn't match."): - Table({"a": [], "b": []}).add_row(Row({"beer": None, "rips": None})) From 484478876e25f7d1d0cbe66728d9400822288c2e Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 30 Jun 2023 16:28:16 +0200 Subject: [PATCH 16/51] fix: fix `add_rows` Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 39 ++++++++----------- .../containers/_table/test_add_rows.py | 9 +---- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 7bbb4b24d..992c59512 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -24,11 +24,9 @@ DuplicateColumnNameError, IndexOutOfBoundsError, NonNumericColumnError, - SchemaMismatchError, UnknownColumnNameError, WrongFileExtensionError, ) - from ._column import Column from ._row import Row @@ -926,6 +924,8 @@ def add_row(self, row: Row) -> Table: int_columns = [] result = self.remove_columns([]) # clone + if self.number_of_columns == 0: + return Table.from_rows([row]) if len(set(self.column_names) - set(row.column_names)) > 0: raise UnknownColumnNameError(list(set(self.column_names) - set(row.column_names))) @@ -986,33 +986,26 @@ def add_rows(self, rows: list[Row] | Table) -> Table: """ if isinstance(rows, Table): rows = rows.to_rows() - int_columns = [] - result = self.remove_columns([]) # clone - missing_col_names = set() - for row in rows: - missing_col_names.update(set(self.column_names) - set(row.column_names)) - if len(missing_col_names) > 0: - raise UnknownColumnNameError(list(missing_col_names)) + if len(rows) == 0: + return copy.deepcopy(self) + different_column_names = set() for row in rows: - if result.number_of_rows == 0: - int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names)) - if result.number_of_columns == 0: - for column in row.column_names: - result._data[column] = Column(column, []) - result._schema = Schema._from_pandas_dataframe(result._data) + different_column_names.update(set(rows[0].column_names) - set(row.column_names)) + if len(different_column_names) > 0: + raise UnknownColumnNameError(list(different_column_names)) - row_frames = (row._data for row in rows) + different_column_names = set() + if self.number_of_columns != 0: + different_column_names.update(set(self.column_names) - set(rows[0].column_names)) + if len(different_column_names) > 0: + raise UnknownColumnNameError(list(different_column_names)) - new_df = pd.concat([result._data, *row_frames]).infer_objects() - new_df.columns = result.column_names - result = Table._from_pandas_dataframe(new_df) - - for column in int_columns: - result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))]) + result = copy.deepcopy(self) - result._schema = Schema.merge_multiple_schemas([self.schema, *[row.schema for row in rows]]) + for row in rows: + result = result.add_row(row) return result diff --git a/tests/safeds/data/tabular/containers/_table/test_add_rows.py b/tests/safeds/data/tabular/containers/_table/test_add_rows.py index 999f7b74d..1ee4e3dd6 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_rows.py @@ -74,7 +74,7 @@ def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Tabl ( Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), [Row({"col1": 2, "col3": 4}), Row({"col1": 5, "col2": "Hallo"})], - r"Could not find column\(s\) 'col2'", + r"Could not find column\(s\) 'col3'", ), ], ids=["column names do not match"], @@ -82,10 +82,3 @@ def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Tabl def test_should_raise_error_if_row_column_names_invalid(table: Table, rows: list[Row], expected_error_msg: str) -> None: with pytest.raises(UnknownColumnNameError, match=expected_error_msg): table.add_rows(rows) - - -def test_should_raise_schema_mismatch() -> None: - with raises(SchemaMismatchError, match=r"Failed because at least two schemas didn't match."): - Table({"a": [], "b": []}).add_rows([Row({"a": None, "b": None}), Row({"beer": None, "rips": None})]) - with raises(SchemaMismatchError, match=r"Failed because at least two schemas didn't match."): - Table({"a": [], "b": []}).add_rows([Row({"beer": None, "rips": None}), Row({"a": None, "b": None})]) From d3ba7222bb56eb7f66a78737475234a82fba1674 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jul 2023 15:10:49 +0200 Subject: [PATCH 17/51] Fix merge_multiple_schemas() method to also handle Nothing Types correctly --- src/safeds/data/tabular/typing/_schema.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index cd7801599..8aa28d153 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from safeds.data.tabular.typing import Anything, Integer, RealNumber +from safeds.data.tabular.typing import Anything, Integer, RealNumber, Nothing from safeds.data.tabular.typing._column_type import ColumnType from safeds.exceptions import UnknownColumnNameError @@ -277,6 +277,16 @@ def merge_multiple_schemas(schemas: list[Schema]) -> Schema: ): schema_dict[col_name] = RealNumber(nullable) continue + if ( + isinstance(schema_dict[col_name], Nothing) + ): + schema_dict[col_name] = type(schema.get_column_type(col_name))(nullable) + continue + if ( + isinstance(schema.get_column_type(col_name), Nothing) + ): + schema_dict[col_name] = type(schema_dict[col_name])(nullable) + continue schema_dict[col_name] = Anything(nullable) return Schema(schema_dict) From 69d0fb4270b92ddcf845a978ec87da87513f708a Mon Sep 17 00:00:00 2001 From: Simon Date: Sun, 2 Jul 2023 12:30:56 +0200 Subject: [PATCH 18/51] Fix add_row() Method to correctly handle a row with a different schema --- src/safeds/data/tabular/containers/_table.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 992c59512..1ffe99117 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -923,7 +923,6 @@ def add_row(self, row: Row) -> Table: """ int_columns = [] result = self.remove_columns([]) # clone - if self.number_of_columns == 0: return Table.from_rows([row]) if len(set(self.column_names) - set(row.column_names)) > 0: @@ -940,14 +939,12 @@ def add_row(self, row: Row) -> Table: new_df = pd.concat([result._data, row._data]).infer_objects() new_df.columns = result.column_names - result = Table._from_pandas_dataframe(new_df) + schema = Schema.merge_multiple_schemas([result.schema, row.schema]) + result = Table._from_pandas_dataframe(new_df, schema) for column in int_columns: result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))]) - schema = Schema.merge_multiple_schemas([result.schema, row.schema]) - result._schema = schema - return result def add_rows(self, rows: list[Row] | Table) -> Table: From 30e8c1f0b0ce2198b65c96386c4031226cbadac1 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 09:41:16 +0200 Subject: [PATCH 19/51] fix: fix `remove_rows_with_missing_values` to update schema Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 1ffe99117..852774cae 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -1246,7 +1246,7 @@ def remove_rows_with_missing_values(self) -> Table: """ result = self._data.copy(deep=True) result = result.dropna(axis="index") - return Table._from_pandas_dataframe(result, self._schema) + return Table._from_pandas_dataframe(result) def remove_rows_with_outliers(self) -> Table: """ From 7de0f452b7490dd2cba6786d9175924c1b9a4314 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 09:57:49 +0200 Subject: [PATCH 20/51] fix: fix table transformer error handling Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- .../tabular/transformation/_label_encoder.py | 6 +++--- .../tabular/transformation/_one_hot_encoder.py | 6 +++--- .../tabular/transformation/_range_scaler.py | 18 +++++++++--------- .../tabular/transformation/_standard_scaler.py | 18 +++++++++--------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py index 20fec0436..089cdee4f 100644 --- a/src/safeds/data/tabular/transformation/_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_label_encoder.py @@ -152,6 +152,9 @@ def inverse_transform(self, transformed_table: Table) -> Table: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if transformed_table.number_of_rows == 0: + raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") + if transformed_table.keep_only_columns( self._column_names, ).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names): @@ -168,9 +171,6 @@ def inverse_transform(self, transformed_table: Table) -> Table: ), ) - if transformed_table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") - data = transformed_table._data.copy() data.columns = transformed_table.column_names data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index 1070478fe..f1bd25197 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -271,6 +271,9 @@ def inverse_transform(self, transformed_table: Table) -> Table: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if transformed_table.number_of_rows == 0: + raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") + if transformed_table.keep_only_columns( _transformed_column_names, ).remove_columns_with_non_numerical_values().number_of_columns < len(_transformed_column_names): @@ -287,9 +290,6 @@ def inverse_transform(self, transformed_table: Table) -> Table: ), ) - if transformed_table.number_of_rows == 0: - raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") - original_columns = {} for original_column_name in self._column_names: original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py index e7f8c16ba..38b163e9d 100644 --- a/src/safeds/data/tabular/transformation/_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_range_scaler.py @@ -65,6 +65,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") + if ( table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns < table.keep_only_columns(column_names).number_of_columns @@ -82,9 +85,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: ), ) - if table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") - wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) wrapped_transformer.fit(table._data[column_names]) @@ -130,6 +130,9 @@ def transform(self, table: Table) -> Table: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") + if ( table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns < table.keep_only_columns(self._column_names).number_of_columns @@ -147,9 +150,6 @@ def transform(self, table: Table) -> Table: ), ) - if table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") - data = table._data.copy() data.columns = table.column_names data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) @@ -190,6 +190,9 @@ def inverse_transform(self, transformed_table: Table) -> Table: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if transformed_table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") + if ( transformed_table.keep_only_columns(self._column_names) .remove_columns_with_non_numerical_values() @@ -209,9 +212,6 @@ def inverse_transform(self, transformed_table: Table) -> Table: ), ) - if transformed_table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") - data = transformed_table._data.copy() data.columns = transformed_table.column_names data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py index 3c190c58f..a3b213a11 100644 --- a/src/safeds/data/tabular/transformation/_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_standard_scaler.py @@ -48,6 +48,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") + if ( table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns < table.keep_only_columns(column_names).number_of_columns @@ -65,9 +68,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: ), ) - if table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") - wrapped_transformer = sk_StandardScaler() wrapped_transformer.fit(table._data[column_names]) @@ -113,6 +113,9 @@ def transform(self, table: Table) -> Table: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") + if ( table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns < table.keep_only_columns(self._column_names).number_of_columns @@ -130,9 +133,6 @@ def transform(self, table: Table) -> Table: ), ) - if table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") - data = table._data.copy() data.columns = table.column_names data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) @@ -173,6 +173,9 @@ def inverse_transform(self, transformed_table: Table) -> Table: if len(missing_columns) > 0: raise UnknownColumnNameError(missing_columns) + if transformed_table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") + if ( transformed_table.keep_only_columns(self._column_names) .remove_columns_with_non_numerical_values() @@ -192,9 +195,6 @@ def inverse_transform(self, transformed_table: Table) -> Table: ), ) - if transformed_table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") - data = transformed_table._data.copy() data.columns = transformed_table.column_names data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) From 7af83da1462ea0e8a6ffd854b07f267930e5d0e6 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 10:57:29 +0200 Subject: [PATCH 21/51] fix: fix `one_hot_encoder` to be able to handle `float("nan")` values Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/transformation/_one_hot_encoder.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index f1bd25197..230ce5a7e 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -300,6 +300,12 @@ def inverse_transform(self, transformed_table: Table) -> Table: if transformed_table.get_column(constructed_column)[i] == 1.0: original_columns[original_column_name][i] = value + for original_column_name in self._value_to_column_nans: + constructed_column = self._value_to_column_nans[original_column_name] + for i in range(transformed_table.number_of_rows): + if transformed_table.get_column(constructed_column)[i] == 1.0: + original_columns[original_column_name][i] = float("nan") + table = transformed_table for column_name, encoded_column in original_columns.items(): From ec62ba3f0bd67277f2462eef481a6c1f9f90facd Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 11:21:54 +0200 Subject: [PATCH 22/51] fix: fix test_row parameterize Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- tests/safeds/data/tabular/containers/test_row.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/safeds/data/tabular/containers/test_row.py b/tests/safeds/data/tabular/containers/test_row.py index 43fe0df93..0f1380750 100644 --- a/tests/safeds/data/tabular/containers/test_row.py +++ b/tests/safeds/data/tabular/containers/test_row.py @@ -516,7 +516,7 @@ def test_should_contain_td_element_for_each_value(self, row: Row) -> None: class TestCopy: @pytest.mark.parametrize( "row", - [Row(), Row({"a": [3, 0.1]})], + [Row(), Row({"a": 3, "b": 4})], ids=["empty", "normal"], ) def test_should_copy_table(self, row: Row) -> None: From fafad3ae7f041524a6129f3e1b503a6106611e52 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 11:35:35 +0200 Subject: [PATCH 23/51] fix: add typehints Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/typing/_column_type.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 620558dae..ee0864d4d 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -13,7 +13,7 @@ class ColumnType(ABC): """Abstract base class for column types.""" @abstractmethod - def __init__(self, is_nullable: bool = False): + def __init__(self, is_nullable: bool = False) -> None: pass @staticmethod @@ -107,7 +107,7 @@ class Anything(ColumnType): _is_nullable: bool - def __init__(self, is_nullable: bool = False): + def __init__(self, is_nullable: bool = False) -> None: self._is_nullable = is_nullable def __repr__(self) -> str: @@ -152,7 +152,7 @@ class Boolean(ColumnType): _is_nullable: bool - def __init__(self, is_nullable: bool = False): + def __init__(self, is_nullable: bool = False) -> None: self._is_nullable = is_nullable def __repr__(self) -> str: @@ -197,7 +197,7 @@ class RealNumber(ColumnType): _is_nullable: bool - def __init__(self, is_nullable: bool = False): + def __init__(self, is_nullable: bool = False) -> None: self._is_nullable = is_nullable def __repr__(self) -> str: @@ -242,7 +242,7 @@ class Integer(ColumnType): _is_nullable: bool - def __init__(self, is_nullable: bool = False): + def __init__(self, is_nullable: bool = False) -> None: self._is_nullable = is_nullable def __repr__(self) -> str: @@ -287,7 +287,7 @@ class String(ColumnType): _is_nullable: bool - def __init__(self, is_nullable: bool = False): + def __init__(self, is_nullable: bool = False) -> None: self._is_nullable = is_nullable def __repr__(self) -> str: @@ -325,7 +325,7 @@ class Nothing(ColumnType): _is_nullable: bool - def __init__(self): + def __init__(self) -> None: self._is_nullable = True def __repr__(self) -> str: From 9d3818eb2fe767976fb9afd18d4445bea98d021a Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 7 Jul 2023 13:15:34 +0200 Subject: [PATCH 24/51] Try stuff to make linter happy --- src/safeds/data/tabular/typing/_column_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index ee0864d4d..9c6cdea42 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -52,7 +52,7 @@ def column_type_of_type(cell_type: Any) -> ColumnType: message = f"Unsupported numpy data type '{cell_type}'." raise NotImplementedError(message) - result = Nothing() + result: ColumnType = Nothing() is_nullable = False for cell in data: if result == Nothing(): From 0f474c995c805d83eb6f89b0854dc25e56758f81 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 13:25:52 +0200 Subject: [PATCH 25/51] fix: fix error handling and typehint Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 4 ++-- src/safeds/data/tabular/transformation/_one_hot_encoder.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 9bf9876ad..878e027e0 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -935,7 +935,8 @@ def add_row(self, row: Row) -> Table: result._data[column] = Column(column, []) result._schema = Schema._from_pandas_dataframe(result._data) elif result.column_names != row.column_names: - raise UnknownColumnNameError + unknown_columns = list(set(row.column_names) - set(result.column_names)) + raise UnknownColumnNameError(unknown_columns) new_df = pd.concat([result._data, row._data]).infer_objects() new_df.columns = result.column_names @@ -983,7 +984,6 @@ def add_rows(self, rows: list[Row] | Table) -> Table: """ if isinstance(rows, Table): rows = rows.to_rows() - int_columns = [] result = self._copy() if len(rows) == 0: diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index 230ce5a7e..c74cce58e 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -292,7 +292,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: original_columns = {} for original_column_name in self._column_names: - original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] + original_columns[original_column_name]: list[Any] = [None for _ in range(transformed_table.number_of_rows)] for original_column_name, value in self._value_to_column: constructed_column = self._value_to_column[(original_column_name, value)] @@ -304,7 +304,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: constructed_column = self._value_to_column_nans[original_column_name] for i in range(transformed_table.number_of_rows): if transformed_table.get_column(constructed_column)[i] == 1.0: - original_columns[original_column_name][i] = float("nan") + original_columns[original_column_name][i]: Any = float("nan") table = transformed_table From 55679291fbc400f664c7bfb99572a70bd58d8795 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 7 Jul 2023 13:26:17 +0200 Subject: [PATCH 26/51] Try stuff to make linter happy --- src/safeds/data/tabular/typing/_column_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 9c6cdea42..1c7e1cc63 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -52,7 +52,7 @@ def column_type_of_type(cell_type: Any) -> ColumnType: message = f"Unsupported numpy data type '{cell_type}'." raise NotImplementedError(message) - result: ColumnType = Nothing() + result: ColumnType | Integer | String | Boolean | RealNumber | Nothing = Nothing() is_nullable = False for cell in data: if result == Nothing(): From 9d443182b35d03f894f0c0cf3f3e14da9bf7e228 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 7 Jul 2023 13:32:26 +0200 Subject: [PATCH 27/51] Try stuff to make linter happy --- src/safeds/data/tabular/typing/_column_type.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 1c7e1cc63..04e4d963f 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -12,6 +12,7 @@ class ColumnType(ABC): """Abstract base class for column types.""" + _is_nullable: bool @abstractmethod def __init__(self, is_nullable: bool = False) -> None: pass @@ -52,7 +53,7 @@ def column_type_of_type(cell_type: Any) -> ColumnType: message = f"Unsupported numpy data type '{cell_type}'." raise NotImplementedError(message) - result: ColumnType | Integer | String | Boolean | RealNumber | Nothing = Nothing() + result: ColumnType = Nothing() is_nullable = False for cell in data: if result == Nothing(): From 7949af1ebf61b89e658da261f74566e89b5e3fe1 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 13:34:11 +0200 Subject: [PATCH 28/51] fix: trying our best to make the linter happy Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/transformation/_one_hot_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index c74cce58e..8d190a750 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -292,7 +292,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: original_columns = {} for original_column_name in self._column_names: - original_columns[original_column_name]: list[Any] = [None for _ in range(transformed_table.number_of_rows)] + original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] for original_column_name, value in self._value_to_column: constructed_column = self._value_to_column[(original_column_name, value)] @@ -304,7 +304,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: constructed_column = self._value_to_column_nans[original_column_name] for i in range(transformed_table.number_of_rows): if transformed_table.get_column(constructed_column)[i] == 1.0: - original_columns[original_column_name][i]: Any = float("nan") + original_columns[original_column_name][i] = np.nan table = transformed_table From 4903ae4494f1846d51bf9d448405492b07345682 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:36:27 +0000 Subject: [PATCH 29/51] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_row.py | 3 +-- src/safeds/data/tabular/containers/_table.py | 3 ++- src/safeds/data/tabular/typing/__init__.py | 2 +- .../data/tabular/typing/_column_type.py | 7 +++++-- src/safeds/data/tabular/typing/_schema.py | 10 +++------- .../_column/test_from_pandas_series.py | 2 +- .../tabular/containers/_column/test_init.py | 2 +- .../tabular/containers/_table/test_add_row.py | 16 ++++++++++----- .../containers/_table/test_add_rows.py | 1 - .../tabular/containers/_table/test_split.py | 2 +- .../data/tabular/typing/test_column_type.py | 20 +++++++++++++------ .../safeds/data/tabular/typing/test_schema.py | 7 +++---- 12 files changed, 43 insertions(+), 32 deletions(-) diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py index 3916784e7..fa92d9330 100644 --- a/src/safeds/data/tabular/containers/_row.py +++ b/src/safeds/data/tabular/containers/_row.py @@ -1,9 +1,8 @@ from __future__ import annotations +import copy import functools from collections.abc import Callable, Mapping -import copy -from collections.abc import Mapping from typing import TYPE_CHECKING, Any import pandas as pd diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 878e027e0..6995eacce 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -27,6 +27,7 @@ UnknownColumnNameError, WrongFileExtensionError, ) + from ._column import Column from ._row import Row @@ -1480,7 +1481,7 @@ def slice_rows( def sort_columns( self, comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name) - - (col1.name < col2.name), + - (col1.name < col2.name), ) -> Table: """ Sort the columns of a `Table` with the given comparator and return a new `Table`. diff --git a/src/safeds/data/tabular/typing/__init__.py b/src/safeds/data/tabular/typing/__init__.py index 14823d345..09ff84404 100644 --- a/src/safeds/data/tabular/typing/__init__.py +++ b/src/safeds/data/tabular/typing/__init__.py @@ -1,6 +1,6 @@ """Types used to define the schema of a tabular dataset.""" -from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String, Nothing +from ._column_type import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String from ._imputer_strategy import ImputerStrategy from ._schema import Schema diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 04e4d963f..dbaecbe35 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -3,16 +3,19 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from types import NoneType -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np -import pandas as pd + +if TYPE_CHECKING: + import pandas as pd class ColumnType(ABC): """Abstract base class for column types.""" _is_nullable: bool + @abstractmethod def __init__(self, is_nullable: bool = False) -> None: pass diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index 8aa28d153..32a6bc3af 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from safeds.data.tabular.typing import Anything, Integer, RealNumber, Nothing +from safeds.data.tabular.typing import Anything, Integer, Nothing, RealNumber from safeds.data.tabular.typing._column_type import ColumnType from safeds.exceptions import UnknownColumnNameError @@ -277,14 +277,10 @@ def merge_multiple_schemas(schemas: list[Schema]) -> Schema: ): schema_dict[col_name] = RealNumber(nullable) continue - if ( - isinstance(schema_dict[col_name], Nothing) - ): + if isinstance(schema_dict[col_name], Nothing): schema_dict[col_name] = type(schema.get_column_type(col_name))(nullable) continue - if ( - isinstance(schema.get_column_type(col_name), Nothing) - ): + if isinstance(schema.get_column_type(col_name), Nothing): schema_dict[col_name] = type(schema_dict[col_name])(nullable) continue schema_dict[col_name] = Anything(nullable) diff --git a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py index f26d9a60c..e4787c0fa 100644 --- a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py +++ b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Column -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything +from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String @pytest.mark.parametrize( diff --git a/tests/safeds/data/tabular/containers/_column/test_init.py b/tests/safeds/data/tabular/containers/_column/test_init.py index 68a11d40c..9f2f2ec80 100644 --- a/tests/safeds/data/tabular/containers/_column/test_init.py +++ b/tests/safeds/data/tabular/containers/_column/test_init.py @@ -3,7 +3,7 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Column -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything +from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String def test_should_store_the_name() -> None: diff --git a/tests/safeds/data/tabular/containers/_table/test_add_row.py b/tests/safeds/data/tabular/containers/_table/test_add_row.py index 5a748e84f..d64e47f41 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_row.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_row.py @@ -2,7 +2,7 @@ from _pytest.python_api import raises from safeds.data.tabular.containers import Row, Table from safeds.data.tabular.typing import Anything, Integer, Schema -from safeds.exceptions import UnknownColumnNameError, SchemaMismatchError +from safeds.exceptions import UnknownColumnNameError @pytest.mark.parametrize( @@ -30,16 +30,22 @@ Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Row({"col1": 5, "col2": 6}), Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}), - Schema({"col1": Integer(), "col2": Integer()}) + Schema({"col1": Integer(), "col2": Integer()}), ), ( Table({"col1": [], "col2": []}), Row({"col1": 5, "col2": 6}), Table({"col1": [5], "col2": [6]}), - Schema({"col1": Integer(), "col2": Integer()}) - ) + Schema({"col1": Integer(), "col2": Integer()}), + ), + ], + ids=[ + "added row", + "different schemas", + "different schemas and nullable", + "add row to rowless table", + "add row to empty table", ], - ids=["added row", "different schemas", "different schemas and nullable", "add row to rowless table", "add row to empty table"], ) def test_should_add_row(table: Table, row: Row, expected: Table, expected_schema: Schema) -> None: result = table.add_row(row) diff --git a/tests/safeds/data/tabular/containers/_table/test_add_rows.py b/tests/safeds/data/tabular/containers/_table/test_add_rows.py index 1ee4e3dd6..b5cd8742a 100644 --- a/tests/safeds/data/tabular/containers/_table/test_add_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_add_rows.py @@ -1,5 +1,4 @@ import pytest -from _pytest.python_api import raises from safeds.data.tabular.containers import Row, Table from safeds.exceptions import UnknownColumnNameError diff --git a/tests/safeds/data/tabular/containers/_table/test_split.py b/tests/safeds/data/tabular/containers/_table/test_split.py index ef39bb2ab..c5066f944 100644 --- a/tests/safeds/data/tabular/containers/_table/test_split.py +++ b/tests/safeds/data/tabular/containers/_table/test_split.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import Integer, Schema, Nothing +from safeds.data.tabular.typing import Integer, Nothing, Schema @pytest.mark.parametrize( diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 9655f656d..49edcb8f3 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,15 +1,14 @@ -from typing import Iterable +from collections.abc import Iterable import pytest - from safeds.data.tabular.typing import ( Anything, Boolean, ColumnType, Integer, + Nothing, RealNumber, String, - Nothing, ) @@ -27,10 +26,19 @@ class TestDataType: ([1.0, 2.0, None], RealNumber(is_nullable=True)), ([True, False, None], Boolean(is_nullable=True)), (["a", None, "b"], String(is_nullable=True)), - ], - ids=["Integer", "Real number", "Boolean", "String", "Mixed", "None", "Nullable integer", - "Nullable RealNumber", "Nullable Boolean", "Nullable String"], + ids=[ + "Integer", + "Real number", + "Boolean", + "String", + "Mixed", + "None", + "Nullable integer", + "Nullable RealNumber", + "Nullable Boolean", + "Nullable String", + ], ) def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None: assert ColumnType._data_type(data) == expected diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index c81f50fe8..523c7de60 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -1,15 +1,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterable +from typing import TYPE_CHECKING import pandas as pd import pytest - - -from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything +from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, RealNumber, Schema, String from safeds.exceptions import UnknownColumnNameError if TYPE_CHECKING: + from collections.abc import Iterable from typing import Any From 90a9948402c06c72858e1c33b6bd57dab8a0534b Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 7 Jul 2023 13:38:08 +0200 Subject: [PATCH 30/51] Add comment to linter solution --- src/safeds/data/tabular/typing/_column_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 04e4d963f..ffd2646a6 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -12,7 +12,7 @@ class ColumnType(ABC): """Abstract base class for column types.""" - _is_nullable: bool + _is_nullable: bool # This line is just here so the linter doesn't throw an error in line 63. @abstractmethod def __init__(self, is_nullable: bool = False) -> None: pass From 109b2a7d2cb8acf46320ea9cb7c2e21a837c6079 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:45:23 +0000 Subject: [PATCH 31/51] style: apply automated linter fixes --- src/safeds/data/tabular/typing/_column_type.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index d0d60a3d2..e1c7c068d 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -15,6 +15,7 @@ class ColumnType(ABC): """Abstract base class for column types.""" _is_nullable: bool # This line is just here so the linter doesn't throw an error in line 63. + @abstractmethod def __init__(self, is_nullable: bool = False) -> None: pass From b4881f6a92425a90ef47b32252459f4bfcee0078 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Fri, 7 Jul 2023 14:15:10 +0200 Subject: [PATCH 32/51] fix: fix `_data_type` Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/typing/_column_type.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index e1c7c068d..646aee0cc 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -14,7 +14,7 @@ class ColumnType(ABC): """Abstract base class for column types.""" - _is_nullable: bool # This line is just here so the linter doesn't throw an error in line 63. + _is_nullable: bool # This line is just here so the linter doesn't throw an error @abstractmethod def __init__(self, is_nullable: bool = False) -> None: @@ -44,10 +44,10 @@ def _data_type(data: pd.Series) -> ColumnType: def column_type_of_type(cell_type: Any) -> ColumnType: if cell_type == int or cell_type == np.int64 or cell_type == np.int32: return Integer(is_nullable) - if cell_type == bool: - return Boolean(is_nullable) if cell_type == float or cell_type == np.float64 or cell_type == np.float32: return RealNumber(is_nullable) + if cell_type == bool: + return Boolean(is_nullable) if cell_type == str: return String(is_nullable) if cell_type is NoneType: @@ -68,7 +68,7 @@ def column_type_of_type(cell_type: Any) -> ColumnType: if type(cell) is NoneType: is_nullable = True result._is_nullable = is_nullable - elif result == Integer and type(cell) == float: + elif (isinstance(result, Integer) and isinstance(column_type_of_type(type(cell)), RealNumber)) or (isinstance(result, RealNumber) and isinstance(column_type_of_type(type(cell)), Integer)): result = RealNumber(is_nullable) else: result = Anything(is_nullable) From 7d89395aeb8f079babd01cadea28823227125799 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Fri, 7 Jul 2023 12:16:57 +0000 Subject: [PATCH 33/51] style: apply automated linter fixes --- src/safeds/data/tabular/typing/_column_type.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 646aee0cc..71d8cf34e 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -68,7 +68,9 @@ def column_type_of_type(cell_type: Any) -> ColumnType: if type(cell) is NoneType: is_nullable = True result._is_nullable = is_nullable - elif (isinstance(result, Integer) and isinstance(column_type_of_type(type(cell)), RealNumber)) or (isinstance(result, RealNumber) and isinstance(column_type_of_type(type(cell)), Integer)): + elif (isinstance(result, Integer) and isinstance(column_type_of_type(type(cell)), RealNumber)) or ( + isinstance(result, RealNumber) and isinstance(column_type_of_type(type(cell)), Integer) + ): result = RealNumber(is_nullable) else: result = Anything(is_nullable) From 626707524d2dc12f53f5ff204dbe4ddd5e4b6be0 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 14:10:57 +0200 Subject: [PATCH 34/51] test: add tests for `sort_columns` Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- .../data/tabular/containers/test_row.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/safeds/data/tabular/containers/test_row.py b/tests/safeds/data/tabular/containers/test_row.py index 0f1380750..58f1ded65 100644 --- a/tests/safeds/data/tabular/containers/test_row.py +++ b/tests/safeds/data/tabular/containers/test_row.py @@ -1,5 +1,5 @@ import re -from typing import Any +from typing import Any, Callable import pandas as pd import pytest @@ -523,3 +523,35 @@ def test_should_copy_table(self, row: Row) -> None: copied = row._copy() assert copied == row assert copied is not row + + +class TestSortColumns: + @pytest.mark.parametrize( + ("row", "comparator", "expected"), + [ + (Row({"b": 1, "a": 2}), lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), Row({"a": 2, "b": 1})), + (Row({"a": 2, "b": 1}), lambda col1, col2: (col2[0] > col1[0]) - (col2[0] < col1[0]), Row({"b": 1, "a": 2})), + (Row(), lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), Row()), + ], + ids=[ + "sort descending by first element", + "sort ascending by first element", + "empty rows", + ], + ) + def test_should_sort_table(self, row: Row, comparator: Callable[[tuple[str, Any], tuple[str, Any]], int], expected: Row) -> None: + row = row.sort_columns(comparator) + assert row == expected + + @pytest.mark.parametrize( + "row", + [ + (Row({"b": 1, "a": 2})), + ], + ids=[ + "sort descending by first element", + ], + ) + def test_should_sort_table_out_of_place(self, row: Row) -> None: + sorted_row = row.sort_columns() + assert sorted_row != row From 9d72cd028f43f2b95fe2b5c312f79d75edcc3435 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 15:06:00 +0200 Subject: [PATCH 35/51] test: add test for unsupported data types Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- .../safeds/data/tabular/typing/test_column_type.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 49edcb8f3..ba76ed5a6 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -1,5 +1,7 @@ from collections.abc import Iterable +from typing import Any +import numpy as np import pytest from safeds.data.tabular.typing import ( Anything, @@ -43,6 +45,17 @@ class TestDataType: def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None: assert ColumnType._data_type(data) == expected + @pytest.mark.parametrize( + ("data", "error_message"), + [ + (np.array([1, 2, 3], dtype=np.int16), "Unsupported numpy data type ''.") + ], + ids=["int16 not supported"], + ) + def test_should_throw_not_implemented_error_when_type_is_not_supported(self, data: Any, error_message: str) -> None: + with pytest.raises(NotImplementedError, match=error_message): + ColumnType._data_type(data) + class TestRepr: @pytest.mark.parametrize( From 8386ac2c77fabcd6829444cd57e157f406068eb1 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 17:25:18 +0200 Subject: [PATCH 36/51] fix: remove unnecessary code Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 4e43da9f0..c80e321f4 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -947,14 +947,7 @@ def add_row(self, row: Row) -> Table: raise UnknownColumnNameError(list(set(self.column_names) - set(row.column_names))) if result.number_of_rows == 0: - int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names)) - if result.number_of_columns == 0: - for column in row.column_names: - result._data[column] = Column(column, []) - result._schema = Schema._from_pandas_dataframe(result._data) - elif result.column_names != row.column_names: - unknown_columns = list(set(row.column_names) - set(result.column_names)) - raise UnknownColumnNameError(unknown_columns) + int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64 | np.int32), row.column_names)) new_df = pd.concat([result._data, row._data]).infer_objects() new_df.columns = result.column_names From a52a12b69e9e40876a3b99efda34fa83fdc8202f Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 17:31:49 +0200 Subject: [PATCH 37/51] fix: remove duplicate code Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/containers/_table.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index c80e321f4..747d0fdee 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -1006,12 +1006,6 @@ def add_rows(self, rows: list[Row] | Table) -> Table: if len(different_column_names) > 0: raise UnknownColumnNameError(list(different_column_names)) - different_column_names = set() - if self.number_of_columns != 0: - different_column_names.update(set(self.column_names) - set(rows[0].column_names)) - if len(different_column_names) > 0: - raise UnknownColumnNameError(list(different_column_names)) - result = self._copy() for row in rows: From 672b6b13b2477edf3a5cc71a6da6b2f957b5d6be Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 17:35:40 +0200 Subject: [PATCH 38/51] fix: remove unnecessary code Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/typing/_column_type.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 71d8cf34e..7b81db942 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -16,10 +16,6 @@ class ColumnType(ABC): _is_nullable: bool # This line is just here so the linter doesn't throw an error - @abstractmethod - def __init__(self, is_nullable: bool = False) -> None: - pass - @staticmethod def _data_type(data: pd.Series) -> ColumnType: """ From 6ed099058a450a462d24a9529699a6a9344fcc38 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 18:22:33 +0200 Subject: [PATCH 39/51] test: Add test for CodeCov Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- src/safeds/data/tabular/typing/_column_type.py | 4 ++++ tests/safeds/data/tabular/typing/test_column_type.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 7b81db942..71d8cf34e 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -16,6 +16,10 @@ class ColumnType(ABC): _is_nullable: bool # This line is just here so the linter doesn't throw an error + @abstractmethod + def __init__(self, is_nullable: bool = False) -> None: + pass + @staticmethod def _data_type(data: pd.Series) -> ColumnType: """ diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index ba76ed5a6..a29fd8bcf 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -118,3 +118,10 @@ class TestIsNumeric: ) def test_should_return_whether_the_column_type_is_numeric(self, column_type: ColumnType, expected: bool) -> None: assert column_type.is_numeric() == expected + + +# We need this test for CodeCoverage +class TestAbstractClass: + def test_should_raise_if_abstract_class_is_initialized(self) -> None: + with pytest.raises(TypeError): + ColumnType() From 531e7d6bd5cf8b07aea1e42c375317e4720f3852 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 18:29:42 +0200 Subject: [PATCH 40/51] fix: Fix Typehint and add match to raises Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- tests/safeds/data/tabular/containers/test_row.py | 2 +- tests/safeds/data/tabular/typing/test_column_type.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/safeds/data/tabular/containers/test_row.py b/tests/safeds/data/tabular/containers/test_row.py index 58f1ded65..96c897e41 100644 --- a/tests/safeds/data/tabular/containers/test_row.py +++ b/tests/safeds/data/tabular/containers/test_row.py @@ -539,7 +539,7 @@ class TestSortColumns: "empty rows", ], ) - def test_should_sort_table(self, row: Row, comparator: Callable[[tuple[str, Any], tuple[str, Any]], int], expected: Row) -> None: + def test_should_sort_columns(self, row: Row, comparator: Callable[[tuple, tuple], int], expected: Row) -> None: row = row.sort_columns(comparator) assert row == expected diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index a29fd8bcf..781d1a66d 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -123,5 +123,5 @@ def test_should_return_whether_the_column_type_is_numeric(self, column_type: Col # We need this test for CodeCoverage class TestAbstractClass: def test_should_raise_if_abstract_class_is_initialized(self) -> None: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="Can't instantiate abstract class ColumnType with abstract methods __init__, is_nullable, is_numeric"): ColumnType() From 18f27eab1c76868c7eea1b0b539ac82f78e2063b Mon Sep 17 00:00:00 2001 From: alex-senger Date: Mon, 10 Jul 2023 18:35:57 +0200 Subject: [PATCH 41/51] test: remove test because it makes the Linter fail Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --- tests/safeds/data/tabular/typing/test_column_type.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 781d1a66d..55e14942c 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -119,9 +119,3 @@ class TestIsNumeric: def test_should_return_whether_the_column_type_is_numeric(self, column_type: ColumnType, expected: bool) -> None: assert column_type.is_numeric() == expected - -# We need this test for CodeCoverage -class TestAbstractClass: - def test_should_raise_if_abstract_class_is_initialized(self) -> None: - with pytest.raises(TypeError, match="Can't instantiate abstract class ColumnType with abstract methods __init__, is_nullable, is_numeric"): - ColumnType() From ecb8796fe2112b94237e8068eea2293765688f53 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:38:04 +0000 Subject: [PATCH 42/51] style: apply automated linter fixes --- tests/safeds/data/tabular/containers/test_row.py | 15 ++++++++++++--- .../data/tabular/typing/test_column_type.py | 5 +---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/safeds/data/tabular/containers/test_row.py b/tests/safeds/data/tabular/containers/test_row.py index 96c897e41..011553f27 100644 --- a/tests/safeds/data/tabular/containers/test_row.py +++ b/tests/safeds/data/tabular/containers/test_row.py @@ -1,5 +1,6 @@ import re -from typing import Any, Callable +from collections.abc import Callable +from typing import Any import pandas as pd import pytest @@ -529,8 +530,16 @@ class TestSortColumns: @pytest.mark.parametrize( ("row", "comparator", "expected"), [ - (Row({"b": 1, "a": 2}), lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), Row({"a": 2, "b": 1})), - (Row({"a": 2, "b": 1}), lambda col1, col2: (col2[0] > col1[0]) - (col2[0] < col1[0]), Row({"b": 1, "a": 2})), + ( + Row({"b": 1, "a": 2}), + lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), + Row({"a": 2, "b": 1}), + ), + ( + Row({"a": 2, "b": 1}), + lambda col1, col2: (col2[0] > col1[0]) - (col2[0] < col1[0]), + Row({"b": 1, "a": 2}), + ), (Row(), lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]), Row()), ], ids=[ diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 55e14942c..fc9c7a5ca 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -47,9 +47,7 @@ def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) @pytest.mark.parametrize( ("data", "error_message"), - [ - (np.array([1, 2, 3], dtype=np.int16), "Unsupported numpy data type ''.") - ], + [(np.array([1, 2, 3], dtype=np.int16), "Unsupported numpy data type ''.")], ids=["int16 not supported"], ) def test_should_throw_not_implemented_error_when_type_is_not_supported(self, data: Any, error_message: str) -> None: @@ -118,4 +116,3 @@ class TestIsNumeric: ) def test_should_return_whether_the_column_type_is_numeric(self, column_type: ColumnType, expected: bool) -> None: assert column_type.is_numeric() == expected - From 95526cb16fa4287c08346952ccc66e8690abe430 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Tue, 11 Jul 2023 14:55:18 +0200 Subject: [PATCH 43/51] fix: replace pass with docstring --- src/safeds/data/tabular/typing/_column_type.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index 71d8cf34e..f9eb19d62 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -18,7 +18,14 @@ class ColumnType(ABC): @abstractmethod def __init__(self, is_nullable: bool = False) -> None: - pass + """ + Abstract initializer for ColumnType. + + Parameters + ---------- + is_nullable + Whether the columntype is nullable. + """ @staticmethod def _data_type(data: pd.Series) -> ColumnType: From 50e15fc0a66bae6ece4a562c4e64e92538cd8b6e Mon Sep 17 00:00:00 2001 From: Alex Senger <91055000+alex-senger@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:57:29 +0200 Subject: [PATCH 44/51] Apply suggestions from code review Co-authored-by: Alexander <47296670+Marsmaennchen221@users.noreply.github.com> --- src/safeds/data/tabular/containers/_row.py | 6 +++--- src/safeds/data/tabular/containers/_table.py | 7 +++++-- src/safeds/data/tabular/typing/_column_type.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py index fa92d9330..5f16ff810 100644 --- a/src/safeds/data/tabular/containers/_row.py +++ b/src/safeds/data/tabular/containers/_row.py @@ -453,7 +453,7 @@ def sort_columns( """ Sort the columns of a `Row` with the given comparator and return a new `Row`. - The original row is not modified. The comparator is a function that takes two Tuples of (ColumnName: Value) `col1` and `col2` and + The original row is not modified. The comparator is a function that takes two tuples of (ColumnName, Value) `col1` and `col2` and returns an integer: * If `col1` should be ordered before `col2`, the function should return a negative number. @@ -464,8 +464,8 @@ def sort_columns( Parameters ---------- - comparator : Callable[[Tuple, Tuple], int] - The function used to compare two Tuples of (ColumnName: Value). + comparator : Callable[[tuple, tuple], int] + The function used to compare two tuples of (ColumnName, Value). Returns ------- diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 747d0fdee..6412ba442 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -911,7 +911,8 @@ def add_row(self, row: Row) -> Table: If the table happens to be empty beforehand, respective columns will be added automatically. The order of columns of the new row will be adjusted to the order of columns in the table. - This table will contain the merged schema. + The new table will contain the merged schema. + This table is not modified. Parameters @@ -964,7 +965,9 @@ def add_rows(self, rows: list[Row] | Table) -> Table: Add multiple rows to a table. The order of columns of the new rows will be adjusted to the order of columns in the table. - This table will contain the merged schema. + The new table will contain the merged schema. + + This table is not modified. Parameters ---------- diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index f9eb19d62..b4ea73703 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -30,7 +30,7 @@ def __init__(self, is_nullable: bool = False) -> None: @staticmethod def _data_type(data: pd.Series) -> ColumnType: """ - Return the column type for a given `numpy` data type. + Return the column type for a given `Series` from `pandas`. Parameters ---------- From 171de136c28d50d4801d939b8a78b841fe1c7be3 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Tue, 11 Jul 2023 13:11:42 +0000 Subject: [PATCH 45/51] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 9a9940963..ae26a62bc 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -912,7 +912,7 @@ def add_row(self, row: Row) -> Table: The order of columns of the new row will be adjusted to the order of columns in the table. The new table will contain the merged schema. - + This table is not modified. Parameters @@ -966,7 +966,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table: The order of columns of the new rows will be adjusted to the order of columns in the table. The new table will contain the merged schema. - + This table is not modified. Parameters From 9c163121070460d2cee9d82514c4f9185ef614c6 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Tue, 11 Jul 2023 20:05:05 +0200 Subject: [PATCH 46/51] fix: remove `SchemaMismatchError` --- src/safeds/exceptions/__init__.py | 2 -- src/safeds/exceptions/_data.py | 7 ------- 2 files changed, 9 deletions(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 021736287..352334cf5 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -9,7 +9,6 @@ IndexOutOfBoundsError, MissingValuesColumnError, NonNumericColumnError, - SchemaMismatchError, TransformerNotFittedError, UnknownColumnNameError, ValueNotPresentWhenFittedError, @@ -43,7 +42,6 @@ "IndexOutOfBoundsError", "MissingValuesColumnError", "NonNumericColumnError", - "SchemaMismatchError", "TransformerNotFittedError", "UnknownColumnNameError", "ValueNotPresentWhenFittedError", diff --git a/src/safeds/exceptions/_data.py b/src/safeds/exceptions/_data.py index f11c7a334..2d2fb7880 100644 --- a/src/safeds/exceptions/_data.py +++ b/src/safeds/exceptions/_data.py @@ -93,13 +93,6 @@ def __init__(self, expected_size: str, actual_size: str): super().__init__(f"Expected a column of size {expected_size} but got column of size {actual_size}.") -class SchemaMismatchError(Exception): - """Exception raised when schemas are unequal.""" - - def __init__(self) -> None: - super().__init__("Failed because at least two schemas didn't match.") - - class ColumnLengthMismatchError(Exception): """Exception raised when the lengths of two or more columns do not match.""" From 4c8f2a98fbdeccff2bf31a60b36c21cf80aafe7b Mon Sep 17 00:00:00 2001 From: Alexander <47296670+Marsmaennchen221@users.noreply.github.com> Date: Tue, 11 Jul 2023 23:59:39 +0200 Subject: [PATCH 47/51] Update src/safeds/data/tabular/typing/__init__.py --- src/safeds/data/tabular/typing/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/__init__.py b/src/safeds/data/tabular/typing/__init__.py index 09ff84404..8b9b4a849 100644 --- a/src/safeds/data/tabular/typing/__init__.py +++ b/src/safeds/data/tabular/typing/__init__.py @@ -10,8 +10,8 @@ "ColumnType", "ImputerStrategy", "Integer", + "Nothing", "RealNumber", "Schema", "String", - "Nothing", ] From 80c35d5830a4dbd3db42d9beb56cda58ed09b714 Mon Sep 17 00:00:00 2001 From: Alex Senger <91055000+alex-senger@users.noreply.github.com> Date: Wed, 12 Jul 2023 15:48:03 +0200 Subject: [PATCH 48/51] Apply suggestions from code review Co-authored-by: Alexander <47296670+Marsmaennchen221@users.noreply.github.com> --- src/safeds/data/tabular/containers/_column.py | 2 +- src/safeds/data/tabular/typing/_column_type.py | 6 ++++++ .../_column/test_from_pandas_series.py | 5 +++-- .../data/tabular/containers/_column/test_init.py | 5 +++-- .../data/tabular/typing/test_column_type.py | 10 +++++++--- tests/safeds/data/tabular/typing/test_schema.py | 16 +++++++++++++--- 6 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index 1d70fb877..9cfdbe402 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -106,7 +106,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None: self._name: str = name self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name) # noinspection PyProtectedMember - self._type: ColumnType = ColumnType._data_type(data) + self._type: ColumnType = ColumnType._data_type(self._data) def __contains__(self, item: Any) -> bool: return item in self._data diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index b4ea73703..cb8de4c7a 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -81,6 +81,12 @@ def column_type_of_type(cell_type: Any) -> ColumnType: result = RealNumber(is_nullable) else: result = Anything(is_nullable) + if isinstance(cell, float) and np.isnan(cell): + is_nullable = True + result._is_nullable = is_nullable + + if isinstance(result, RealNumber) and all(data.apply(lambda c: True if (isinstance(c, float) and np.isnan(c)) or (c == float(int(c))) else False)): + result = Integer(is_nullable) return result diff --git a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py index e4787c0fa..b62e3d1af 100644 --- a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py +++ b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py @@ -38,11 +38,12 @@ def test_should_use_type_if_passed(series: pd.Series, type_: ColumnType) -> None (pd.Series([]), Nothing()), (pd.Series([True, False, True]), Boolean()), (pd.Series([1, 2, 3]), Integer()), - (pd.Series([1.0, 2.0, 3.0]), RealNumber()), + (pd.Series([1.0, 2.0, 3.0]), Integer()), + (pd.Series([1.0, 2.5, 3.0]), RealNumber()), (pd.Series(["a", "b", "c"]), String()), (pd.Series([1, 2.0, "a", True]), Anything(is_nullable=False)), ], - ids=["empty", "boolean", "integer", "real number", "string", "mixed"], + ids=["empty", "boolean", "integer", "real number .0", "real number", "string", "mixed"], ) def test_should_infer_type_if_not_passed(series: pd.Series, expected: ColumnType) -> None: assert Column._from_pandas_series(series).type == expected diff --git a/tests/safeds/data/tabular/containers/_column/test_init.py b/tests/safeds/data/tabular/containers/_column/test_init.py index 9f2f2ec80..d7015b3ff 100644 --- a/tests/safeds/data/tabular/containers/_column/test_init.py +++ b/tests/safeds/data/tabular/containers/_column/test_init.py @@ -46,11 +46,12 @@ def test_should_store_the_data(column: Column, expected: list) -> None: (Column("A", []), Nothing()), (Column("A", [True, False, True]), Boolean()), (Column("A", [1, 2, 3]), Integer()), - (Column("A", [1.0, 2.0, 3.0]), RealNumber()), + (Column("A", [1.0, 2.0, 3.0]), Integer()), + (Column("A", [1.0, 2.5, 3.0]), RealNumber()), (Column("A", ["a", "b", "c"]), String()), (Column("A", [1, 2.0, "a", True]), Anything()), ], - ids=["empty", "boolean", "integer", "real number", "string", "mixed"], + ids=["empty", "boolean", "integer", "real number .0", "real number", "string", "mixed"], ) def test_should_infer_type(column: Column, expected: ColumnType) -> None: assert column.type == expected diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index fc9c7a5ca..09229a80f 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -19,31 +19,35 @@ class TestDataType: ("data", "expected"), [ ([1, 2, 3], Integer(is_nullable=False)), - ([1.0, 2.0, 3.0], RealNumber(is_nullable=False)), + ([1.0, 2.0, 3.0], Integer(is_nullable=False)), + ([1.0, 2.5, 3.0], RealNumber(is_nullable=False)), ([True, False, True], Boolean(is_nullable=False)), (["a", "b", "c"], String(is_nullable=False)), (["a", 1, 2.0], Anything(is_nullable=False)), ([None, None, None], Nothing()), ([None, 1, 2], Integer(is_nullable=True)), - ([1.0, 2.0, None], RealNumber(is_nullable=True)), + ([1.0, 2.0, None], Integer(is_nullable=True)), + ([1.0, 2.5, None], RealNumber(is_nullable=True)), ([True, False, None], Boolean(is_nullable=True)), (["a", None, "b"], String(is_nullable=True)), ], ids=[ "Integer", + "Real number .0", "Real number", "Boolean", "String", "Mixed", "None", "Nullable integer", + "Nullable RealNumber .0", "Nullable RealNumber", "Nullable Boolean", "Nullable String", ], ) def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None: - assert ColumnType._data_type(data) == expected + assert ColumnType._data_type(pd.Series(data)) == expected @pytest.mark.parametrize( ("data", "error_message"), diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 523c7de60..615cf7deb 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -36,6 +36,10 @@ class TestFromPandasDataFrame: pd.DataFrame({"A": [1, 2.0, "a", True]}), Schema({"A": Anything()}), ), + ( + pd.DataFrame({"A": [1.0, 2.5, 3.0]}), + Schema({"A": RealNumber()}), + ), ( pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), Schema({"A": Integer(), "B": String()}), @@ -46,11 +50,15 @@ class TestFromPandasDataFrame: ), ( pd.DataFrame({"A": [1, None, 3]}), - Schema({"A": RealNumber()}), + Schema({"A": Integer(is_nullable=True)}), ), ( pd.DataFrame({"A": [1.0, None, 3.0]}), - Schema({"A": RealNumber()}), + Schema({"A": Integer(is_nullable=True)}), + ), + ( + pd.DataFrame({"A": [1.5, None, 3.0]}), + Schema({"A": RealNumber(is_nullable=True)}), ), ( pd.DataFrame({"A": ["a", None, "c"]}), @@ -64,12 +72,14 @@ class TestFromPandasDataFrame: ids=[ "boolean", "integer", - "real number", + "real number .0", "string", "mixed", + "real number", "multiple columns", "boolean?", "integer?", + "real number? .0", "real number?", "string?", "Anything?", From c944492613a8bb1fbd9ae083ad2bb049f67dc21a Mon Sep 17 00:00:00 2001 From: alex-senger Date: Wed, 12 Jul 2023 15:50:40 +0200 Subject: [PATCH 49/51] test: apply suggestion from codereview --- tests/safeds/data/tabular/typing/test_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 615cf7deb..f6cd256d5 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -26,7 +26,7 @@ class TestFromPandasDataFrame: ), ( pd.DataFrame({"A": [1.0, 2.0, 3.0]}), - Schema({"A": RealNumber()}), + Schema({"A": Integer()}), ), ( pd.DataFrame({"A": ["a", "b", "c"]}), From c2b26792c6dfb352319f03f92a82f8ea66f4bdf4 Mon Sep 17 00:00:00 2001 From: alex-senger Date: Wed, 12 Jul 2023 15:52:08 +0200 Subject: [PATCH 50/51] test: add import --- tests/safeds/data/tabular/typing/test_column_type.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py index 09229a80f..dfe9c13e6 100644 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ b/tests/safeds/data/tabular/typing/test_column_type.py @@ -2,6 +2,7 @@ from typing import Any import numpy as np +import pandas as pd import pytest from safeds.data.tabular.typing import ( Anything, From 551e396b05a0c7c1dbca32e5e015d0101d6d1845 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Wed, 12 Jul 2023 13:54:48 +0000 Subject: [PATCH 51/51] style: apply automated linter fixes --- src/safeds/data/tabular/typing/_column_type.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py index cb8de4c7a..2c4ca4f57 100644 --- a/src/safeds/data/tabular/typing/_column_type.py +++ b/src/safeds/data/tabular/typing/_column_type.py @@ -85,7 +85,9 @@ def column_type_of_type(cell_type: Any) -> ColumnType: is_nullable = True result._is_nullable = is_nullable - if isinstance(result, RealNumber) and all(data.apply(lambda c: True if (isinstance(c, float) and np.isnan(c)) or (c == float(int(c))) else False)): + if isinstance(result, RealNumber) and all( + data.apply(lambda c: bool(isinstance(c, float) and np.isnan(c) or c == float(int(c)))), + ): result = Integer(is_nullable) return result