Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: discretize table #327

Merged
merged 41 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
32f8c91
add Discretizer file
robmeth May 26, 2023
b2705e5
add tests for Discretizer
robmeth May 26, 2023
0d06d95
Merge branch 'main' into 143-discretize-table
robmeth May 26, 2023
d88cdec
added Discretizer to init file
robmeth May 26, 2023
7a21a52
style: apply automated linter fixes
megalinter-bot May 26, 2023
432fde4
removed comment
robmeth May 26, 2023
603fda0
Merge branch 'main' into 143-discretize-table
robmeth May 26, 2023
2e72846
Merge branch '143-discretize-table' of https://github.com/Safe-DS/Std…
robmeth May 26, 2023
8ff87bb
trying to fix the test but it doesn't work
robmeth Jun 9, 2023
2e8cf27
requested changes
robmeth Jun 23, 2023
146cb54
Merge branch 'main' into 143-discretize-table
robmeth Jun 23, 2023
a3e46bb
requested changes
robmeth Jun 23, 2023
b33a95f
import changes
robmeth Jun 23, 2023
f053755
Merge branch 'main' into 143-discretize-table
robmeth Jun 30, 2023
b3d6baf
add ValueError and NonNumericColumnError and test them
robmeth Jun 30, 2023
0435b55
fixed import
robmeth Jun 30, 2023
cf55a5e
style: apply automated linter fixes
megalinter-bot Jun 30, 2023
4b58a0d
style: apply automated linter fixes
megalinter-bot Jun 30, 2023
4a7520a
fixed import
robmeth Jun 30, 2023
452b1d8
fixed import
robmeth Jun 30, 2023
99f1391
style: apply automated linter fixes
megalinter-bot Jun 30, 2023
93fe3cc
fixed import
robmeth Jun 30, 2023
820ae0a
merge conflict
robmeth Jun 30, 2023
30b5482
change order of errors
robmeth Jun 30, 2023
7214b68
style: apply automated linter fixes
megalinter-bot Jun 30, 2023
cf38032
requested changes
robmeth Jun 30, 2023
2a5b687
requested changes
robmeth Jun 30, 2023
bcd834f
Merge branch '143-discretize-table' of https://github.com/Safe-DS/Std…
robmeth Jun 30, 2023
135aa9e
make linter happy
robmeth Jun 30, 2023
4e48200
make linter happy
robmeth Jun 30, 2023
c113b59
make linter happy
robmeth Jun 30, 2023
1f332aa
make linter happy
robmeth Jun 30, 2023
7f0775e
make linter happy
robmeth Jun 30, 2023
9b5edf0
style: apply automated linter fixes
megalinter-bot Jun 30, 2023
11a9803
Merge branch 'main' into 143-discretize-table
robmeth Jun 30, 2023
ed00f74
style: apply automated linter fixes
megalinter-bot Jun 30, 2023
7b5df1b
Merge branch 'main' into 143-discretize-table
robmeth Jun 30, 2023
be05305
requested changes, match error message
robmeth Jul 7, 2023
bc92b9a
Merge branch '143-discretize-table' of https://github.com/Safe-DS/Std…
robmeth Jul 7, 2023
b2c1b29
requested changes
robmeth Jul 7, 2023
85827e4
Merge branch 'main' into 143-discretize-table
robmeth Jul 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/safeds/data/tabular/transformation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Classes for transforming tabular data."""

from ._discretizer import Discretizer
from ._imputer import Imputer
from ._label_encoder import LabelEncoder
from ._one_hot_encoder import OneHotEncoder
Expand All @@ -14,5 +15,6 @@
"InvertibleTableTransformer",
"TableTransformer",
"RangeScaler",
"Discretizer",
"StandardScaler",
]
205 changes: 205 additions & 0 deletions src/safeds/data/tabular/transformation/_discretizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
from __future__ import annotations

from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer

from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation._table_transformer import TableTransformer
robmeth marked this conversation as resolved.
Show resolved Hide resolved
from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError


class Discretizer(TableTransformer):
    """
    The Discretizer bins continuous data into intervals.

    Parameters
    ----------
    number_of_bins: int
        The number of bins to be created.

    Raises
    ------
    ValueError
        If the given number_of_bins is less than 2.
    """

    def __init__(self, number_of_bins: int = 5):
        # Both attributes stay `None` until `fit` returns a fitted copy.
        self._column_names: list[str] | None = None
        self._wrapped_transformer: sk_KBinsDiscretizer | None = None

        if number_of_bins < 2:
            raise ValueError("Parameter 'number_of_bins' must be >= 2.")
        # Forwarded unchanged as `n_bins` to the wrapped scikit-learn discretizer in `fit`.
        self._number_of_bins = number_of_bins

    @staticmethod
    def _check_columns(table: Table, column_names: list[str]) -> None:
        """
        Ensure that all given columns exist in the table and are numeric.

        Parameters
        ----------
        table : Table
            The table to check.
        column_names : list[str]
            The names of the columns that must be present and numeric.

        Raises
        ------
        UnknownColumnNameError
            If one of the columns is not in the table.
        NonNumericColumnError
            If one of the columns is non-numeric.
        """
        missing_columns = set(column_names) - set(table.column_names)
        if len(missing_columns) > 0:
            # Report the missing names in the order in which they were requested.
            position = {name: index for index, name in enumerate(column_names)}
            raise UnknownColumnNameError(sorted(missing_columns, key=position.__getitem__))

        for name in column_names:
            column_type = table.get_column(name).type
            if not column_type.is_numeric():
                raise NonNumericColumnError(f"{name} is of type {column_type}.")

    def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
        """
        Learn a transformation for a set of columns in a table.

        This transformer is not modified.

        Parameters
        ----------
        table : Table
            The table used to fit the transformer.
        column_names : list[str] | None
            The list of columns from the table used to fit the transformer. If `None`, all columns are used.

        Returns
        -------
        fitted_transformer : TableTransformer
            The fitted transformer.

        Raises
        ------
        ValueError
            If the table is empty.
        NonNumericColumnError
            If one of the columns that should be fitted is non-numeric.
        UnknownColumnNameError
            If one of the columns that should be fitted is not in the table.
        """
        if table.number_of_rows == 0:
            raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows")

        if column_names is None:
            column_names = table.column_names

        self._check_columns(table, column_names)

        wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal")
        wrapped_transformer.fit(table._data[column_names])

        result = Discretizer(self._number_of_bins)
        result._wrapped_transformer = wrapped_transformer
        # Store a copy so that later changes to the caller's list cannot alter the fitted transformer.
        result._column_names = list(column_names)

        return result

    def transform(self, table: Table) -> Table:
        """
        Apply the learned transformation to a table.

        The table is not modified.

        Parameters
        ----------
        table : Table
            The table to which the learned transformation is applied.

        Returns
        -------
        transformed_table : Table
            The transformed table.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        ValueError
            If the table is empty.
        UnknownColumnNameError
            If one of the columns that should be transformed is not in the table.
        NonNumericColumnError
            If one of the columns that should be transformed is non-numeric.
        """
        # Transformer has not been fitted yet
        if self._wrapped_transformer is None or self._column_names is None:
            raise TransformerNotFittedError

        if table.number_of_rows == 0:
            raise ValueError("The table cannot be transformed because it contains 0 rows")

        # Input table must contain all columns used to fit the transformer, and they must be numeric
        self._check_columns(table, self._column_names)

        # Work on a copy so that the input table keeps its original values
        data = table._data.copy()
        data.columns = table.column_names
        data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
        return Table._from_pandas_dataframe(data)

    def is_fitted(self) -> bool:
        """
        Check if the transformer is fitted.

        Returns
        -------
        is_fitted : bool
            Whether the transformer is fitted.
        """
        # Same condition that `transform` uses to decide whether it may run.
        return self._wrapped_transformer is not None and self._column_names is not None

    def get_names_of_added_columns(self) -> list[str]:
        """
        Get the names of all new columns that have been added by the Discretizer.

        Returns
        -------
        added_columns : list[str]
            A list of names of the added columns, ordered as they will appear in the table.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        """
        if not self.is_fitted():
            raise TransformerNotFittedError
        return []

    # (Must implement abstract method, cannot instantiate class otherwise.)
    def get_names_of_changed_columns(self) -> list[str]:
        """
        Get the names of all columns that may have been changed by the Discretizer.

        Returns
        -------
        changed_columns : list[str]
            The list of (potentially) changed column names, as passed to fit.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        """
        if self._column_names is None:
            raise TransformerNotFittedError
        # Return a copy so that callers cannot modify the fitted state.
        return list(self._column_names)

    def get_names_of_removed_columns(self) -> list[str]:
        """
        Get the names of all columns that have been removed by the Discretizer.

        Returns
        -------
        removed_columns : list[str]
            A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        """
        if not self.is_fitted():
            raise TransformerNotFittedError
        return []
Loading