diff --git a/.pyenchant_dessia.dict b/.pyenchant_dessia.dict index 19eebcbec..7cf07616e 100644 --- a/.pyenchant_dessia.dict +++ b/.pyenchant_dessia.dict @@ -1,12 +1,16 @@ +adam args argspecs agglomerative backend +BallTree +binarized BinaryFile bool boolean boolarg builtins +cKDTree centroid centroids classmethod @@ -18,12 +22,16 @@ CMA config csv cvar +Cython dataset Dataset datasets Datasets datatools dbscan +DecisionTree +DecisionTreeClassifier +DecisionTreeRegressor dectree dendrogram deserialization @@ -32,11 +40,13 @@ deserialized dessia DessiaObject dict +dicts dimensionality distarg docstring Dessia's DessiaFilter +DOE enum Enum eps @@ -48,7 +58,9 @@ frontend hlist hotfix ge +getitem getsizeof +gini gt gte init @@ -56,21 +68,36 @@ intarg instantiation iterable itertools +th json JSON jsonschema +kernel +KMeans kwargs -le +hyperparameter +hyperparameters +l1 +l2 len -lhs +LinearModel +LinearRegression +linf linspace -lp lt +le +lhs +lp lte Mahalanobis matchable matplotlib Minkowski +MLPClassifier +MLPRegressor +multiclass +MultiLayerPerceptron +multioutput Mongodb Multiplot nbv @@ -84,12 +111,25 @@ Optimizer optimizers Optimizers orjson +Platt +poly +RandomForest +RandomForestClassifier +RandomForestRegressor +rbf +Ridge Pycharm rtype +scalers schemas scipy +scikit serializable +sigmoid sklearn +SupportVectorClassifier +SupportVectorMachine +SupportVectorRegressor sqrt STL str @@ -114,4 +154,5 @@ uuids visjs volmdlr XLS +XLSX xlsx diff --git a/.pylintrc b/.pylintrc index 68a8c0ee3..a665575eb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -352,7 +352,7 @@ function-naming-style=snake_case # Good variable names which should always be accepted, separated by a comma. good-names=i,j,k,n,ie,x,xi,x0,v1,v2, - ax, + ax,_probA,_probB,C, _ # Good variable names regexes, separated by a comma. If names match any regex, diff --git a/CHANGELOG.md b/CHANGELOG.md index cbde0c75c..15c400255 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Blocks : add GetModelAttribute, which will replace ModelAttribute in a few releases +- Blocks : display blocks are now more configurable (custom selector and type) +- DessiaObject: add type to load_from_file method +- Document generator: add new class Table +- Files: .doc & .docx files typings +- Schemas refactor: add support of Ellipsis tuples (Tuple[T, ...]) +- Schemas refactor: add more error checks +- Schemas refactor: add a JSON export to method schemas for low-code implementations +- Schemas refactor: add default value to method types +- Schemas refactor: add standalone_in_db property +- Typings : add AttributeType and ClassAttributeType +- Datatools : + - Add the possibility to use the main scikit-learn machine learning algorithms as DessiaObjects + - Add a Modeler class that allows initializing, training, validating and using a machine learning model from sklearn + - Add CrossValidation tools for these models + - Link it to Dataset - Schemas : Steps have been added to provide a framework for wizard. - Workflow : Steps and methods have been added to provide a framework for wizard. 
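A minimal sketch of the machine-learning workflow these Datatools entries introduce, using only methods that appear later in this diff (`Dataset.train_test_split`, `Dataset.to_input_output`, `learning_models.Ridge`). The car attributes and `all_cars_wi_feat` come from the doctest examples further down; the chosen column names and hyperparameters are illustrative assumptions, not a prescribed recipe:

```python
from dessia_common.models import all_cars_wi_feat
from dessia_common.datatools.dataset import Dataset
from dessia_common.datatools import learning_models as models

# Wrap the cars in a Dataset and split it into train / test sub-Datasets
dataset = Dataset(all_cars_wi_feat, name="cars")
train, test = dataset.train_test_split(ratio=0.8, shuffled=True)

# Build input / output matrices from chosen common attributes (names assumed from the doctests)
features = ['displacement', 'horsepower', 'weight']
inputs, outputs = train.to_input_output(input_names=features, output_names=['mpg'])

# Fit a Ridge regression wrapped as a DessiaObject, then score and predict on the held-out data
ridge = models.Ridge.fit(inputs, outputs, alpha=0.1, name="cars_ridge")
test_inputs, test_outputs = test.to_input_output(input_names=features, output_names=['mpg'])
print(ridge.score(test_inputs, test_outputs))
print(ridge.predict(test_inputs)[:3])
```

The same pattern should carry over to the other regressors and classifiers added in `learning_models.py`.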
@@ -440,6 +456,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - code_pydocstyle.py checks daily instead of weekly - Add a time decrease effect for pylint +### Added +- modeling.py file in datatools which contains: + * StandardScaler, IdentityScaler + * Regressions: SVR, RandomForest, MLP, Ridge, DecisionTree + * Classifications: SVC, RandomForest, MLP, DecisionTree +- modeler.py file in datatools which allows to : + * Handle full machine learning modelization process (scale data, train model and validate), with plots + ### Performance - Conform doc for many parts of dessia_common @@ -474,6 +498,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix path deepth when dict misses keys ### Changed + - Refactor copy_pipes and nbv checking ### Performance diff --git a/dessia_common/core.py b/dessia_common/core.py index 8f1df76ee..213892e3e 100644 --- a/dessia_common/core.py +++ b/dessia_common/core.py @@ -631,7 +631,7 @@ def save_export_to_file(self, selector: str, filepath: str): raise ValueError(f'Export selector not found: {selector}') def to_vector(self): - """ Compute vector from object. """ + """ Get all values of specified attributes into a list of values (vector). """ vectored_objects = [] for feature in self.vector_features(): vectored_objects.append(dch.get_in_object_from_path(self, feature.lower())) @@ -639,7 +639,7 @@ def to_vector(self): @classmethod def vector_features(cls): - """ Get a list of vector features, or generate a default one. """ + """ Get the list of attributes specified in _vector_features attribute (in order to build a Dataset). """ if cls._vector_features is None: return list(set(get_attribute_names(cls)).difference(get_attribute_names(DessiaObject))) return cls._vector_features @@ -1055,7 +1055,7 @@ def get_booleans_index(self, dobjects_list: List[DessiaObject]): :param dobject_list: List of data to filter :type dobject_list: List[DessiaObject] - :return: A `booleans index` of `dobjects_list` of the list of data to filter (`dobjects_list`) + :return: A boolean index of `dobjects_list` of the list of data to filter (`dobjects_list`) :rtype: List[bool] :Examples: diff --git a/dessia_common/datatools/__init__.py b/dessia_common/datatools/__init__.py index fea5c85a5..ae571ac40 100644 --- a/dessia_common/datatools/__init__.py +++ b/dessia_common/datatools/__init__.py @@ -1,7 +1,6 @@ -""" __init__ method for datatools module """ +""" __init__ method for datatools module. 
""" import warnings -from typing import List from dessia_common.core import DessiaObject import dessia_common.datatools.dataset as DS import dessia_common.datatools.cluster as DC @@ -11,7 +10,7 @@ class HeterogeneousList(DS.Dataset): - def __init__(self, dessia_objects: List[DessiaObject] = None, name: str = ''): + def __init__(self, dessia_objects: list[DessiaObject] = None, name: str = ''): self.warning_string() DS.Dataset.__init__(self, dessia_objects=dessia_objects, name=name) @@ -23,7 +22,7 @@ def warning_string(self): class CategorizedList(DC.ClusteredDataset): - def __init__(self, dessia_objects: List[DessiaObject] = None, labels: List[int] = None, name: str = ''): + def __init__(self, dessia_objects: list[DessiaObject] = None, labels: list[int] = None, name: str = ''): self.warning_string() DC.ClusteredDataset.__init__(self, dessia_objects=dessia_objects, labels=labels, name=name) diff --git a/dessia_common/datatools/cluster.py b/dessia_common/datatools/cluster.py index 1adc1e3f0..6bf632c89 100644 --- a/dessia_common/datatools/cluster.py +++ b/dessia_common/datatools/cluster.py @@ -1,5 +1,4 @@ """ Library for building clusters on Dataset or List. """ -from typing import List from scipy.spatial.distance import cdist import numpy as npy @@ -20,30 +19,27 @@ class ClusteredDataset(Dataset): """ Base object for handling a categorized (clustered) list of DessiaObjects. - **ClusteredDataset should be instantiated with** `from_...` **methods.** + **`ClusteredDataset` should be instantiated with** `from_...` **methods.** - **Do not use** `__init__` **to instantiate a ClusteredDataset.** + **Do not use** `__init__` **to instantiate a `ClusteredDataset`.** :param dessia_objects: - -------- - List of DessiaObjects to store in ClusteredDataset - :type dessia_objects: `List[DessiaObject]`, `optional`, defaults to `None` + List of DessiaObjects to store in `ClusteredDataset` + :type dessia_objects: `list[DessiaObject]`, `optional`, defaults to `None` :param labels: - -------- - Labels of DessiaObjects' cluster stored in ClusteredDataset - :type labels: `List[int]`, `optional`, defaults to `None` + Labels of DessiaObjects' cluster stored in `ClusteredDataset` + :type labels: `list[int]`, `optional`, defaults to `None` :param name: - -------- - Name of ClusteredDataset + Name of `ClusteredDataset` :type name: `str`, `optional`, defaults to `""` :Properties: - * **common_attributes:** (`List[str]`) + * **common_attributes:** (`list[str]`) -------- Common attributes of DessiaObjects contained in the current `ClusteredDataset` - * **matrix:** (`List[List[float]]`, `n_samples x n_features`) + * **matrix:** (`list[list[float]]`, `n_samples x n_features`) -------- Matrix of data computed by calling the to_vector method of all dessia_objects * **n_cluster:** (`int`) @@ -55,7 +51,7 @@ class ClusteredDataset(Dataset): _allowed_methods = ['from_agglomerative_clustering', 'from_kmeans', 'from_dbscan', 'from_pareto_sheets'] - def __init__(self, dessia_objects: List[DessiaObject] = None, labels: List[int] = None, name: str = ''): + def __init__(self, dessia_objects: list[DessiaObject] = None, labels: list[int] = None, name: str = ''): """ See class docstring. 
""" Dataset.__init__(self, dessia_objects=dessia_objects, name=name) if labels is None: @@ -78,16 +74,16 @@ def to_xlsx_stream(self, stream): writer.save_to_stream(stream) def _pick_from_slice(self, key: slice): - new_hlist = Dataset._pick_from_slice(self, key) - new_hlist.labels = self.labels[key] - # new_hlist.name += f"_{key.start if key.start is not None else 0}_{key.stop}") - return new_hlist + new_dataset = Dataset._pick_from_slice(self, key) + new_dataset.labels = self.labels[key] + # new_dataset.name += f"_{key.start if key.start is not None else 0}_{key.stop}") + return new_dataset - def _pick_from_boolist(self, key: List[bool]): - new_hlist = Dataset._pick_from_boolist(self, key) - new_hlist.labels = DessiaFilter.apply(self.labels, key) - # new_hlist.name += "_list") - return new_hlist + def _pick_from_boolist(self, key: list[bool]): + new_dataset = Dataset._pick_from_boolist(self, key) + new_dataset.labels = DessiaFilter.apply(self.labels, key) + # new_dataset.name += "_list") + return new_dataset def _printed_attributes(self): return ["label"] + Dataset._printed_attributes(self) @@ -104,27 +100,27 @@ def _get_printed_value(self, index: int, attr: str): def clustered_sublists(self): """ - Split a ClusteredDataset of labeled DessiaObjects into a ClusteredDatasetet of labeled Datasets. + Split a `ClusteredDataset` of labeled DessiaObjects into a `ClusteredDataset` of labeled Datasets. - :return: A ClusteredDataset of length n_cluster that store each cluster in a Dataset. Labels are \ + :return: A `ClusteredDataset` of length n_cluster that store each cluster in a Dataset. Labels are \ the labels of each cluster, i.e. stored Dataset - :rtype: ClusteredDataset[Dataset] + :rtype: `ClusteredDataset`[Dataset] :Examples: >>> from dessia_common.datatools.dataset import Dataset >>> from dessia_common.datatools.cluster import ClusteredDataset >>> from dessia_common.models import all_cars_wi_feat - >>> hlist = Dataset(all_cars_wi_feat, name="cars") - >>> clist = ClusteredDataset.from_agglomerative_clustering(hlist, n_clusters=10, name="ex") - >>> split_clist = clist.clustered_sublists() - >>> print(split_clist[:3]) + >>> dataset = Dataset(all_cars_wi_feat, name="cars") + >>> clustered_dataset = ClusteredDataset.from_agglomerative_clustering(dataset, n_clusters=10, name="ex") + >>> split_clustered_dataset = clustered_dataset.clustered_sublists() + >>> print(split_clustered_dataset[:3]) ClusteredDataset ex_split: 3 samples, 2 features, 3 clusters | n° | Name | Common_attributes | --------------------------------------------- | 0 | ex_0 |['mpg', 'displacement...| | 1 | ex_1 |['mpg', 'displacement...| | 2 | ex_2 |['mpg', 'displacement...| - >>> print(split_clist[3][:3]) + >>> print(split_clustered_dataset[3][:3]) Dataset ex_3: 3 samples, 5 features | Mpg | Displacement | Horsepower | Weight | Acceleration | ------------------------------------------------------------------------------- @@ -165,22 +161,22 @@ def mean_clusters(self): :return: A list of `n_cluster` lists of `n_samples` where each element is the average value in a dimension in \ one cluster. 
- :rtype: List[List[float]] + :rtype: list[list[float]] :Examples: >>> from dessia_common.datatools.dataset import Dataset >>> from dessia_common.datatools.cluster import ClusteredDataset >>> from dessia_common.models import all_cars_wi_feat - >>> hlist = Dataset(all_cars_wi_feat, name="cars") - >>> clist = ClusteredDataset.from_agglomerative_clustering(hlist, n_clusters=10, name="ex") - >>> means = clist.mean_clusters() + >>> dataset = Dataset(all_cars_wi_feat, name="cars") + >>> clustered_dataset = ClusteredDataset.from_agglomerative_clustering(dataset, n_clusters=10, name="ex") + >>> means = clustered_dataset.mean_clusters() >>> print(means[0]) [28.83333333333334, 0.10651785714285714, 79.16666666666667, 2250.3571428571427, 16.075000000000006] """ clustered_sublists = self._check_transform_sublists() means = [] - for hlist in clustered_sublists: - means.append(hlist.mean()) + for dataset in clustered_sublists: + means.append(dataset.mean()) return means def cluster_distances(self, method: str = 'minkowski', **kwargs): @@ -211,15 +207,15 @@ def cluster_distances(self, method: str = 'minkowski', **kwargs): :type **kwargs: `dict`, `optional` :return: `n_clusters` lists of distances of all elements of a cluster from its mean. - :rtype: List[List[float]] + :rtype: list[list[float]] :Examples: >>> from dessia_common.datatools.dataset import Dataset >>> from dessia_common.datatools.cluster import ClusteredDataset >>> from dessia_common.models import all_cars_wi_feat - >>> hlist = Dataset(all_cars_wi_feat, name="cars") - >>> clist = ClusteredDataset.from_agglomerative_clustering(hlist, n_clusters=10, name="ex") - >>> cluster_distances = clist.cluster_distances() + >>> dataset = Dataset(all_cars_wi_feat, name="cars") + >>> clustered_dataset = ClusteredDataset.from_agglomerative_clustering(dataset, n_clusters=10, name="ex") + >>> cluster_distances = clustered_dataset.cluster_distances() >>> print(list(map(int, cluster_distances[6]))) [180, 62, 162, 47, 347, 161, 160, 67, 164, 206, 114, 138, 97, 159, 124, 139] """ @@ -227,8 +223,8 @@ def cluster_distances(self, method: str = 'minkowski', **kwargs): kwargs = self._set_distance_kwargs(method, kwargs) means = clustered_sublists.mean_clusters() cluster_distances = [] - for mean_, hlist in zip(means, clustered_sublists): - cluster_distances.append(cdist([mean_], hlist.matrix, method, **kwargs).tolist()[0]) + for mean_, dataset in zip(means, clustered_sublists): + cluster_distances.append(cdist([mean_], dataset.matrix, method, **kwargs).tolist()[0]) return cluster_distances def cluster_real_centroids(self, method: str = 'minkowski', **kwargs): @@ -257,15 +253,15 @@ def cluster_real_centroids(self, method: str = 'minkowski', **kwargs): :type **kwargs: `dict`, `optional` :return: `n_clusters` lists of distances of all elements of a cluster from its mean. 
- :rtype: List[List[float]] + :rtype: list[list[float]] :Examples: >>> from dessia_common.datatools.dataset import Dataset >>> from dessia_common.datatools.cluster import ClusteredDataset >>> from dessia_common.models import all_cars_wi_feat - >>> hlist = Dataset(all_cars_wi_feat, name="cars") - >>> clist = ClusteredDataset.from_agglomerative_clustering(hlist, n_clusters=10, name="ex") - >>> cluster_real_centroids = clist.cluster_real_centroids() + >>> dataset = Dataset(all_cars_wi_feat, name="cars") + >>> clustered_dataset = ClusteredDataset.from_agglomerative_clustering(dataset, n_clusters=10, name="ex") + >>> cluster_real_centroids = clustered_dataset.cluster_real_centroids() >>> print(Dataset([cluster_real_centroids[0]])) Dataset 0x7f752654a0a0: 1 samples, 5 features | Name | Mpg | Displacement | Horsepower | Weight | Acceleration | @@ -283,13 +279,13 @@ def cluster_real_centroids(self, method: str = 'minkowski', **kwargs): return real_centroids def _merge_sublists(self): - merged_hlists = self.dessia_objects[0][:] - merged_labels = [self.labels[0]] * len(merged_hlists) + merged_datasets = self.dessia_objects[0][:] + merged_labels = [self.labels[0]] * len(merged_datasets) for dobject, label in zip(self.dessia_objects[1:], self.labels[1:]): - merged_hlists.extend(dobject) + merged_datasets.extend(dobject) merged_labels.extend([label] * len(dobject)) - plotted_clist = self.__class__(dessia_objects=merged_hlists.dessia_objects, labels=merged_labels) - return plotted_clist + plotted_clustered_dataset = self.__class__(dessia_objects=merged_datasets.dessia_objects, labels=merged_labels) + return plotted_clustered_dataset def _tooltip_attributes(self): return self.common_attributes + ["Cluster Label"] @@ -301,8 +297,8 @@ def plot_data(self, reference_path: str = "#", **kwargs): If dessia_objects are Dataset, merge all Dataset to plot them in one. """ if isinstance(self.dessia_objects[0], Dataset): - plotted_clist = self._merge_sublists() - return plotted_clist.plot_data(reference_path=reference_path, **kwargs) + plotted_clustered_dataset = self._merge_sublists() + return plotted_clustered_dataset.plot_data(reference_path=reference_path, **kwargs) return Dataset.plot_data(self, reference_path=reference_path, **kwargs) def _object_to_sample(self, dessia_object: DessiaObject, row: int, reference_path: str = '#'): @@ -355,7 +351,7 @@ def from_agglomerative_clustering(cls, data: Dataset, n_clusters: int = 2, See more : https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering :param data: The future clustered data. - :type data: List[DessiaObject] + :type data: list[DessiaObject] :param n_clusters: Number of wished clusters. @@ -395,7 +391,7 @@ def from_agglomerative_clustering(cls, data: Dataset, n_clusters: int = 2, Formula is `scaled_x = ( x - mean )/standard_deviation` :type scaling: `bool`, `optional`, default to `False` - :return: a ClusteredDataset that knows the data and their labels + :return: a `ClusteredDataset` that knows the data and their labels :rtype: ClusteredDataset """ skl_cluster = cluster.AgglomerativeClustering( @@ -422,7 +418,7 @@ def from_kmeans(cls, data: Dataset, n_clusters: int = 2, n_init: int = 10, tol: See more : https://scikit-learn.org/stable/modules/clustering.html#k-means :param data: The future clustered data. 
- :type data: List[DessiaObject] + :type data: list[DessiaObject] :param n_clusters: Number of wished clusters @@ -444,7 +440,7 @@ def from_kmeans(cls, data: Dataset, n_clusters: int = 2, n_init: int = 10, tol: Formula is `scaled_x = ( x - mean )/standard_deviation` :type scaling: `bool`, `optional`, default to `False` - :return: a ClusteredDataset that knows the data and their labels + :return: a `ClusteredDataset` that knows the data and their labels :rtype: ClusteredDataset """ skl_cluster = cluster.KMeans(n_clusters=n_clusters, n_init=n_init, tol=tol) @@ -469,7 +465,7 @@ def from_dbscan(cls, data: Dataset, eps: float = 0.5, min_samples: int = 5, mink See more : https://scikit-learn.org/stable/modules/clustering.html#dbscan :param data: The future clustered data. - :type data: List[DessiaObject] + :type data: list[DessiaObject] :param eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. @@ -506,7 +502,7 @@ def from_dbscan(cls, data: Dataset, eps: float = 0.5, min_samples: int = 5, mink Formula is `scaled_x = ( x - mean )/standard_deviation` :type scaling: `bool`, `optional`, default to `False` - :return: a ClusteredDataset that knows the data and their labels + :return: a `ClusteredDataset` that knows the data and their labels :rtype: ClusteredDataset """ skl_cluster = cluster.DBSCAN(eps=eps, min_samples=min_samples, p=mink_power, leaf_size=leaf_size, metric=metric) @@ -514,7 +510,7 @@ def from_dbscan(cls, data: Dataset, eps: float = 0.5, min_samples: int = 5, mink return cls(data.dessia_objects, skl_cluster.labels_.tolist(), name=name) @classmethod - def from_pareto_sheets(cls, h_list: Dataset, costs_columns: List[str], nb_sheets: int = 1): + def from_pareto_sheets(cls, h_list: Dataset, costs_columns: list[str], nb_sheets: int = 1): """ Get successive Pareto sheets where each label is the index of a Pareto sheet put them in a `ClusteredDataset`. @@ -523,8 +519,8 @@ def from_pareto_sheets(cls, h_list: Dataset, costs_columns: List[str], nb_sheets :param h_list: The Dataset in which to pick optimal points. :type h_list: Dataset - :param costs_columns: List of columns' indexes or attributes on which costs are stored in current Dataset - :type costs_columns: `List[str]` + :param costs_columns: list of columns' indexes or attributes on which costs are stored in current Dataset + :type costs_columns: `list[str]` :param nb_sheets: Number of Pareto sheets to pick :type nb_sheets: `int`, `optional`, default to `1` @@ -544,15 +540,15 @@ def from_pareto_sheets(cls, h_list: Dataset, costs_columns: List[str], nb_sheets return cls(dessia_objects, labels) @staticmethod - def fit_cluster(skl_cluster: cluster, matrix: List[List[float]], scaling: bool): + def fit_cluster(skl_cluster: cluster, matrix: list[list[float]], scaling: bool): """ - Find clusters in data set for skl_cluster model. + Find clusters in data set for `skl_cluster` model. :param skl_cluster: sklearn.cluster object to compute clusters. 
:type data: cluster :param matrix: - List of data + list of data :type matrix: `float`, `n_samples x n_features` :param scaling: @@ -570,24 +566,24 @@ def fit_cluster(skl_cluster: cluster, matrix: List[List[float]], scaling: bool): return skl_cluster @classmethod - def list_agglomerative_clustering(cls, data: List[DessiaObject], n_clusters: int = 2, + def list_agglomerative_clustering(cls, data: list[DessiaObject], n_clusters: int = 2, metric: str = 'euclidean', linkage: str = 'ward', distance_threshold: float = None, scaling: bool = False, name: str = ""): - """ Does the same as `from_agglomerative_clustering` method but data is a `List[DessiaObject]`. """ + """ Does the same as `from_agglomerative_clustering` method but data is a `list[DessiaObject]`. """ return cls.from_agglomerative_clustering(Dataset(data), n_clusters=n_clusters, metric=metric, linkage=linkage, distance_threshold=distance_threshold, scaling=scaling, name=name) @classmethod - def list_kmeans(cls, data: List[DessiaObject], n_clusters: int = 2, n_init: int = 10, tol: float = 1e-4, + def list_kmeans(cls, data: list[DessiaObject], n_clusters: int = 2, n_init: int = 10, tol: float = 1e-4, scaling: bool = False, name: str = ""): - """ Does the same as `from_kmeans` method but data is a `List[DessiaObject]`. """ + """ Does the same as `from_kmeans` method but data is a `list[DessiaObject]`. """ return cls.from_kmeans(Dataset(data), n_clusters=n_clusters, n_init=n_init, tol=tol, scaling=scaling, name=name) @classmethod - def list_dbscan(cls, data: List[DessiaObject], eps: float = 0.5, min_samples: int = 5, mink_power: float = 2, + def list_dbscan(cls, data: list[DessiaObject], eps: float = 0.5, min_samples: int = 5, mink_power: float = 2, leaf_size: int = 30, metric: str = "euclidean", scaling: bool = False, name: str = ""): - """ Does the same as `from_dbscan` method but data is a `List[DessiaObject]`. """ + """ Does the same as `from_dbscan` method but data is a `list[DessiaObject]`. """ return cls.from_dbscan(Dataset(data), eps=eps, min_samples=min_samples, mink_power=mink_power, leaf_size=leaf_size, metric=metric, scaling=scaling, name=name) diff --git a/dessia_common/datatools/dataset.py b/dessia_common/datatools/dataset.py index f8fa4546a..e25553315 100644 --- a/dessia_common/datatools/dataset.py +++ b/dessia_common/datatools/dataset.py @@ -1,7 +1,7 @@ """ Library for building Dataset. 
""" import itertools from copy import copy -from typing import Any, Dict, List +from typing import Any import numpy as npy from scipy.spatial.distance import pdist, squareform @@ -15,11 +15,14 @@ pass from dessia_common import templates from dessia_common.core import DessiaFilter, DessiaObject, FiltersList -from dessia_common.datatools.metrics import (covariance_matrix, mean, std, - variance) from dessia_common.decorators import plot_data_view from dessia_common.exports import MarkdownWriter +from dessia_common.datatools.math import mean, std, variance, covariance_matrix, Vector, Matrix, maximums, minimums +from dessia_common.datatools import learning_models as models + + + class Dataset(DessiaObject): """ @@ -28,7 +31,7 @@ class Dataset(DessiaObject): :param dessia_objects: -------- List of DessiaObjects to store in Dataset - :type dessia_objects: List[DessiaObject], `optional`, defaults to `None` + :type dessia_objects: list[DessiaObject], `optional`, defaults to `None` :param name: -------- @@ -36,11 +39,11 @@ class Dataset(DessiaObject): :type name: str, `optional`, defaults to `''` :Properties: - * **common_attributes:** (`List[str]`) + * **common_attributes:** (`list[str]`) -------- Common attributes of DessiaObjects contained in the current `Dataset` - * **matrix:** (`List[List[float]]`, `n_samples x n_features`) + * **matrix:** (`list[list[float]]`, `n_samples x n_features`) -------- Matrix of data computed by calling the `to_vector` method of all `dessia_objects` @@ -48,7 +51,7 @@ class Dataset(DessiaObject): * __init__ >>> from dessia_common.datatools.dataset import Dataset >>> from dessia_common.models import all_cars_wi_feat - >>> hlist = Dataset(all_cars_wi_feat, name="init") + >>> dataset = Dataset(all_cars_wi_feat, name="init") * __str__ >>> print(Dataset(all_cars_wi_feat[:3], name='printed')) @@ -87,7 +90,7 @@ class Dataset(DessiaObject): _vector_features = ["name", "common_attributes"] _non_data_eq_attributes = ["name", "_common_attributes", "_matrix"] - def __init__(self, dessia_objects: List[DessiaObject] = None, name: str = ''): + def __init__(self, dessia_objects: list[DessiaObject] = None, name: str = ''): """ See class docstring. """ if dessia_objects is None: dessia_objects = [] @@ -130,14 +133,14 @@ def __add__(self, other: 'Dataset'): raise TypeError("Addition only defined for Dataset. 
A specific __add__ method is required for " f"{self.__class__}") - sum_hlist = self.__class__(dessia_objects=self.dessia_objects + other.dessia_objects, + sum_dataset = self.__class__(dessia_objects=self.dessia_objects + other.dessia_objects, name=self.name[:5] + '_+_' + other.name[:5]) if all(item in self.common_attributes for item in other.common_attributes): - sum_hlist._common_attributes = self.common_attributes + sum_dataset._common_attributes = self.common_attributes if self._matrix is not None and other._matrix is not None: - sum_hlist._matrix = self._matrix + other._matrix - return sum_hlist + sum_dataset._matrix = self._matrix + other._matrix + return sum_dataset def extend(self, other: 'Dataset'): """ @@ -161,27 +164,27 @@ def _pick_from_int(self, idx: int): return self.dessia_objects[idx] def _pick_from_slice(self, key: slice): - new_hlist = self.__class__(dessia_objects=self.dessia_objects[key], name=self.name) - new_hlist._common_attributes = copy(self._common_attributes) - new_hlist.dessia_objects = self.dessia_objects[key] + new_dataset = self.__class__(dessia_objects=self.dessia_objects[key], name=self.name) + new_dataset._common_attributes = copy(self._common_attributes) + new_dataset.dessia_objects = self.dessia_objects[key] if self._matrix is not None: - new_hlist._matrix = self._matrix[key] - # new_hlist.name += f"_{key.start if key.start is not None else 0}_{key.stop}") - return new_hlist + new_dataset._matrix = self._matrix[key] + # new_dataset.name += f"_{key.start if key.start is not None else 0}_{key.stop}") + return new_dataset - def _indexlist_to_booleanlist(self, index_list: List[int]): + def _indexlist_to_booleanlist(self, index_list: list[int]): boolean_list = [False] * len(self) for idx in index_list: boolean_list[idx] = True return boolean_list - def _pick_from_boolist(self, key: List[bool]): - new_hlist = self.__class__(dessia_objects=DessiaFilter.apply(self.dessia_objects, key), name=self.name) - new_hlist._common_attributes = copy(self._common_attributes) + def _pick_from_boolist(self, key: list[bool]): + new_dataset = self.__class__(dessia_objects=DessiaFilter.apply(self.dessia_objects, key), name=self.name) + new_dataset._common_attributes = copy(self._common_attributes) if self._matrix is not None: - new_hlist._matrix = DessiaFilter.apply(self._matrix, key) - # new_hlist.name += "_list") - return new_hlist + new_dataset._matrix = DessiaFilter.apply(self._matrix, key) + # new_dataset.name += "_list") + return new_dataset def __str__(self): """ Print Dataset as a table. 
""" @@ -195,14 +198,14 @@ def __str__(self): string = "" string += self._print_titles(attr_space) string += "\n" + "-" * len(string) - string += self._print_objects_slice(slice(0, 5), attr_space) + string += self._print_objects_slice(slice(0, 5, 1), attr_space) if len(self) > 10: undispl_len = len(self) - 10 string += (f"\n+ {undispl_len} undisplayed object" + "s" * (min([undispl_len, 2]) - 1) + "...") if len(self) > 5: - string += self._print_objects_slice(slice(-5, len(self)), attr_space) + string += self._print_objects_slice(slice(len(self) - 5, len(self), 1), attr_space) return prefix + "\n" + string + "\n" def _printed_attributes(self): @@ -210,9 +213,9 @@ def _printed_attributes(self): return self.common_attributes return ['name'] + self.common_attributes - def _print_objects_slice(self, key: slice, attr_space: List[int]): + def _print_objects_slice(self, key: slice, attr_space: list[int]): string = "" - for index in range(len(self[key])): + for index in range(key.start, key.stop, key.step): string += "\n" string += self._print_object(index, attr_space) return string @@ -222,7 +225,7 @@ def _write_str_prefix(self): prefix += f"{len(self)} samples, {len(self.common_attributes)} features" return prefix - def _print_titles(self, attr_space: List[int]): + def _print_titles(self, attr_space: list[int]): min_col_length = 16 printed_attributes = self._printed_attributes() string = "" @@ -235,15 +238,15 @@ def _print_titles(self, attr_space: List[int]): indentation = 3 else: indentation = min_col_length - len(attr) - odd_incr = int(indentation % 2) - indentation = int(indentation / 2) + odd_incr = int(indentation % 2) + indentation = int(indentation / 2) name_attr = " " * indentation + " " * odd_incr + f"{attr.capitalize()}" + " " * indentation attr_space.append(len(name_attr)) string += "|" + name_attr + end_bar return string - def _print_object(self, index: int, attr_space: List[int]): + def _print_object(self, index: int, attr_space: list[int]): printed_attributes = self._printed_attributes() string = "" for idx, attr in enumerate(printed_attributes): @@ -282,12 +285,12 @@ def _get_printed_value(self, index: int, attr: str): return self.matrix[index][self.common_attributes.index(attr)] def __len__(self): - """Length of Dataset is len(Dataset.dessia_objects).""" + """ Length of Dataset is len(Dataset.dessia_objects). """ return len(self.dessia_objects) @property def common_attributes(self): - """List of common attributes of stored dessia_objects.""" + """ List of common attributes of stored dessia_objects. """ if self._common_attributes is None: if len(self) == 0: return [] @@ -335,7 +338,7 @@ def attribute_values(self, attribute: str): :type attribute: str :return: A list of all values of the specified attribute of dessia_objects - :rtype: List[Any] + :rtype: list[Any] :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -357,7 +360,7 @@ def column_values(self, index: int): :type index: int :return: A list of all values of the specified attribute of dessia_objects - :rtype: List[float] + :rtype: list[float] :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -367,17 +370,17 @@ def column_values(self, index: int): """ return [row[index] for row in self.matrix] - def sub_matrix(self, columns_names: List[str]): + def sub_matrix(self, columns_names: list[str]): """ Build a sub matrix of the current Dataset taking column numbers in indexes or attribute values in attributes. Warning: Only one of `indexes` or `attributes` has to be specified. 
- :param columns_names: List of columns' names to create a sub matrix - :type columns_names: List[str] + :param columns_names: list of columns' names to create a sub matrix + :type columns_names: list[str] :return: Data stored in matrix reduced to the specified `indexes` or `attributes` - :rtype: List[List[float]] + :rtype: list[list[float]] :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -389,6 +392,16 @@ def sub_matrix(self, columns_names: List[str]): transposed_submatrix = [self.attribute_values(column_name) for column_name in columns_names] return list(map(list, zip(*transposed_submatrix))) + def to_input_output(self, input_names: list[str], output_names: list[str]) -> list[Matrix]: + """ Split matrix of Dataset in two matrices inputs and outputs according to input_names and output_names. """ + return self.sub_matrix(input_names), self.sub_matrix(output_names) + + def train_test_split(self, ratio: float = 0.8, shuffled: bool = True) -> tuple[Matrix, Matrix]: + """ Generate train and test Datasets from current Dataset.""" + index_train, index_test = models.get_split_indexes(len_matrix=len(self), ratio=ratio, shuffled=shuffled) + return (Dataset(self[index_train], name=self.name + '_train'), + Dataset(self[index_test], name=self.name + '_test')) + def sort(self, key: Any, ascend: bool = True): # TODO : Replace numpy with faster algorithms """ Sort the current Dataset along the given key. @@ -434,15 +447,24 @@ def sort(self, key: Any, ascend: bool = True): # TODO : Replace numpy with fast sort_indexes = npy.argsort(self.attribute_values(key)) self.dessia_objects = [self.dessia_objects[idx] for idx in (sort_indexes if ascend else sort_indexes[::-1])] if self._matrix is not None: - self._matrix = [self._matrix[idx] for idx in - (sort_indexes if ascend else sort_indexes[::-1])] + self._matrix = [self._matrix[idx] for idx in (sort_indexes if ascend else sort_indexes[::-1])] + + @property + def maximums(self): + """ Compute maximum values and store it in a list of length `n_features`. """ + return maximums(self.matrix) + + @property + def minimums(self): + """ Compute minimum values and store it in a list of length `n_features`. """ + return minimums(self.matrix) def mean(self): """ Compute means along each `common_attribute`. :return: A list of means along each dimension - :rtype: List[float] + :rtype: list[float] :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -458,7 +480,7 @@ def standard_deviation(self): Compute standard deviations along each `common_attribute`. :return: A list of standard deviations along each dimension - :rtype: List[float] + :rtype: list[float] :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -474,7 +496,7 @@ def variances(self): Compute variances along each `common_attribute`. :return: A list of variances along each dimension - :rtype: List[float] + :rtype: list[float] :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -490,7 +512,7 @@ def covariance_matrix(self): Compute the covariance matrix of `self.matrix`. 
:return: the covariance matrix of all stored data in self - :rtype: List[List[float]], `n_features x n_features` + :rtype: list[list[float]], `n_features x n_features` :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -532,7 +554,7 @@ def distance_matrix(self, method: str = 'minkowski', **kwargs): :type **kwargs: dict, `optional` :return: the distance matrix of all stored data in self - :rtype: List[List[float]], `n_samples x n_samples` + :rtype: list[list[float]], `n_samples x n_samples` :Examples: >>> from dessia_common.datatools.dataset import Dataset @@ -550,7 +572,7 @@ def distance_matrix(self, method: str = 'minkowski', **kwargs): return distances.tolist() @staticmethod - def _set_distance_kwargs(method: str, kwargs: Dict[str, Any]): + def _set_distance_kwargs(method: str, kwargs: dict[str, Any]): if 'p' not in kwargs and method == 'minkowski': kwargs['p'] = 2 return kwargs @@ -606,7 +628,7 @@ def singular_values(self): :return: **normalized_singular_values**: list of normalized singular values **singular_points**: list of points to plot in dimensionality plot. Does not add any information. - :rtype: Tuple[List[float], List[Dict[str, float]]] + :rtype: tuple[list[float], list[dict[str, float]]] """ scaled_data = Dataset._scale_data(npy.array(self.matrix) - npy.mean(self.matrix, axis=0)) _, singular_values, _ = npy.linalg.svd(npy.array(scaled_data).T, full_matrices=False) @@ -620,7 +642,7 @@ def singular_values(self): return normalized_singular_values, singular_points @staticmethod - def _scale_data(data_matrix: List[List[float]]): + def _scale_data(data_matrix: Matrix) -> Matrix: # TODO : replace it with the models Scaler ? scaled_matrix = preprocessing.StandardScaler().fit_transform(data_matrix) return [list(map(float, row.tolist())) for row in scaled_matrix] @@ -651,7 +673,7 @@ def plot_histogram(self, reference_path: str = "#", **kwargs): plot_mono_attr.elements = data_list return plot_mono_attr - def _build_multiplot(self, data_list: List[Dict[str, float]], tooltip: List[str], **kwargs: Dict[str, Any]): + def _build_multiplot(self, data_list: list[dict[str, float]], tooltip: list[str], **kwargs: dict[str, Any]): subplots = [] for line in self.common_attributes: for idx_col, col in enumerate(self.common_attributes): @@ -688,7 +710,7 @@ def _to_samples(self, reference_path: str = '#'): def _point_families(self): return [PointFamily(Color(182/255, 225/255, 251/255), list(range(len(self))))] - def _parallel_plot(self, data_list: List[Dict[str, float]]): + def _parallel_plot(self, data_list: list[dict[str, float]]): return ParallelPlot(elements=data_list, axes=self._parallel_plot_attr(), disposition='vertical') def _parallel_plot_attr(self): @@ -794,7 +816,7 @@ def _plot_dimensionality(self): return dimensionality_plot @staticmethod - def _check_costs(len_data: int, costs: List[List[float]]): + def _check_costs(len_data: int, costs: Matrix) -> Matrix: if len(costs) != len_data: if len(costs[0]) == len_data: return list(map(list, zip(*costs))) @@ -803,7 +825,7 @@ def _check_costs(len_data: int, costs: List[List[float]]): return costs @staticmethod - def pareto_indexes(costs: List[List[float]]): + def pareto_indexes(costs: Matrix) -> list[bool]: """ Find the Pareto-efficient points. 
@@ -820,7 +842,7 @@ def pareto_indexes(costs: List[List[float]]): return is_efficient.tolist() @staticmethod - def pareto_frontiers(len_data: int, costs: List[List[float]]): + def pareto_frontiers(len_data: int, costs: list[list[float]]): """ Experimental method to draw the borders of Pareto domain. """ checked_costs = Dataset._check_costs(len_data, costs) pareto_indexes = Dataset.pareto_indexes(checked_costs) @@ -838,8 +860,7 @@ def pareto_frontiers(len_data: int, costs: List[List[float]]): return pareto_frontiers @staticmethod - def _pareto_frontier_2d(x_dim: int, y_dim: int, pareto_costs: List[List[float]], max_x_dim: float, - super_mini: List[float]): + def _pareto_frontier_2d(x_dim: int, y_dim: int, pareto_costs: Matrix, max_x_dim: float, super_mini: Vector): # Experimental minidx = npy.argmin(pareto_costs[:, y_dim]) x_coord = pareto_costs[minidx, x_dim] @@ -858,17 +879,17 @@ def _pareto_frontier_2d(x_dim: int, y_dim: int, pareto_costs: List[List[float]], dir_coeffs[chosen_line] + offsets[chosen_line]]]).T return frontier_2d - def _compute_costs(self, costs_attributes: List[str]): + def _compute_costs(self, costs_attributes: list[str]): costs = self.sub_matrix(costs_attributes) return Dataset._check_costs(len(self), costs) - def pareto_points(self, costs_attributes: List[str]): + def pareto_points(self, costs_attributes: list[str]): """ Find the Pareto-efficient points. :param costs_attributes: List of columns' attributes on which costs are stored in current Dataset - :type costs_attributes: List[str] + :type costs_attributes: list[str] :return: a Dataset containing the selected points :rtype: Dataset @@ -876,18 +897,18 @@ def pareto_points(self, costs_attributes: List[str]): checked_costs = self._compute_costs(costs_attributes) return self[self.__class__.pareto_indexes(checked_costs)] - def pareto_sheets(self, costs_attributes: List[str], nb_sheets: int = 1): + def pareto_sheets(self, costs_attributes: list[str], nb_sheets: int = 1): """ Get successive Pareto sheets (i.e. optimal points in a DOE for already computed costs). :param costs_attributes: List of columns' attributes on which costs are stored in current Dataset - :type costs_attributes: List[str] + :type costs_attributes: list[str] :param nb_sheets: Number of Pareto sheets to pick :type nb_sheets: int, `optional`, default to `1` :return: The successive Pareto sheets and not selected elements - :rtype: List[Dataset], Dataset + :rtype: list[Dataset], Dataset """ checked_costs = self._compute_costs(costs_attributes) non_optimal_costs = checked_costs[:] diff --git a/dessia_common/datatools/learning_models.py b/dessia_common/datatools/learning_models.py new file mode 100644 index 000000000..b09ae0c40 --- /dev/null +++ b/dessia_common/datatools/learning_models.py @@ -0,0 +1,1630 @@ +""" Tools and base classes for machine learning methods. """ +from typing import Any, Union +import random + +import numpy as npy +from sklearn import preprocessing, linear_model, ensemble, tree, svm, neural_network + +from dessia_common.core import DessiaObject +from dessia_common.utils.types import is_sequence + +Vector = list[float] +Matrix = list[Vector] + +# ====================================================================================================================== +# S C A L E R S +# ====================================================================================================================== +class Scaler(DessiaObject): + """ Base object for handling a scikit-learn Scaler. 
""" + + _rebuild_attributes = [] + _allowed_methods = DessiaObject._allowed_methods + ["fit", "transform", "inverse_transform", "fit_transform", + "transform_matrices", "inverse_transform_matrices"] + + def __init__(self, name: str = ''): + DessiaObject.__init__(self, name=name) + + @classmethod + def _skl_class(cls): + raise NotImplementedError('Method _skl_class not implemented for Scaler. Please use children.') + + def _init_empty(self): + return self._skl_class()() + + @staticmethod + def _set_class(is_scaled: bool) -> 'Scaler': + if is_scaled: + return StandardScaler + return IdentityScaler + + @staticmethod + def _set_name(modeler_name: str, in_out: str, is_scaled: bool) -> str: + name = f"{modeler_name}_" + return name + (f"{in_out}_scaler" if is_scaled else "identity_scaler") + + @classmethod + def set_in_modeler(cls, modeler_name: str, in_out: str, is_scaled: bool) -> tuple['Scaler', str]: + """ Set scaler in modeler. """ + class_ = cls._set_class(is_scaled) + name = cls._set_name(modeler_name, in_out, is_scaled) + return class_, name + + def instantiate_skl(self): + """ Instantiate scikit-learn `Scaler` from `Scaler` dessia object, or children. """ + scaler = self._init_empty() + for attr in self._rebuild_attributes: + setattr(scaler, attr, getattr(self, attr)) + return scaler + + @classmethod + def instantiate_dessia(cls, scaler, name: str = '') -> 'Scaler': + """ Instantiate `Scaler` dessia object, or children, from scikit-learn scaler. """ + kwargs = {attr: get_scaler_attribute(scaler, attr) for attr in cls._rebuild_attributes} + kwargs["name"] = name + return cls(**kwargs) + + @classmethod + def fit(cls, matrix: Matrix, name: str = '') -> 'Scaler': + """ + Fit scaler with data stored in matrix. + + :param matrix: + Matrix of data of dimension `n_samples x n_features` + + :param name: + Name of Scaler + + :return: The Scaler or children (DessiaObject) fit on matrix. + """ + scaler = cls._skl_class()() + reshaped_matrix = vector_to_2d_matrix(matrix) + scaler.fit(reshaped_matrix) + return cls.instantiate_dessia(scaler, name) + + def _prepare_transform(self, matrix: Matrix) -> tuple['Scaler', Matrix]: + return self.instantiate_skl(), vector_to_2d_matrix(matrix) + + def transform(self, matrix: Matrix) -> Matrix: + """ + Transform the data stored in matrix according to this Scaler or children. + + :param matrix: + Matrix of data of dimension `n_samples x n_features` + + :return: The scaled matrix according to the rules set of scaler. + """ + scaler, reshaped_matrix = self._prepare_transform(matrix) + return scaler.transform(reshaped_matrix).tolist() + + + def inverse_transform(self, matrix: Matrix) -> Matrix: + """ + Inverse transform the scaled data stored in matrix according to this Scaler or children. + + :param matrix: + Scaled matrix of data of dimension `n_samples x n_features` + + :return: The raw matrix according to the rules of scaler. + """ + scaler, reshaped_matrix = self._prepare_transform(matrix) + return scaler.inverse_transform(reshaped_matrix).tolist() + + @classmethod + def fit_transform(cls, matrix: Matrix, name: str = '') -> tuple['Scaler', Matrix]: + """ Fit scaler with data of matrix and transform it. It is the succession of fit and transform methods. """ + reshaped_matrix = vector_to_2d_matrix(matrix) + scaler = cls.fit(reshaped_matrix, name) + return scaler, scaler.transform(reshaped_matrix) + + def transform_matrices(self, *matrices: tuple[Matrix]) -> tuple[Matrix]: + """ Iteratively scale all matrices. 
""" + return tuple(self.transform(m) for m in matrices) + + def inverse_transform_matrices(self, *scaled_matrices: tuple[Matrix]) -> tuple[Matrix]: + """ Iteratively invert scaler for all matrices. """ + return tuple(self.inverse_transform(m) for m in scaled_matrices) + + +class StandardScaler(Scaler): + """ + Data scaler that standard scale data. The operation made by this scaler is `new_X = (X - mean(X))/std(X)`. + + :param mean_: + List of means. + + :param scale_: + List of standard deviations. + + :param var_: + List of variances. + """ + + _rebuild_attributes = ['mean_', 'scale_', 'var_'] + _standalone_in_db = True + + def __init__(self, mean_: Vector = None, scale_: Vector = None, var_: Vector = None, name: str = ''): + self.mean_ = mean_ + self.scale_ = scale_ + self.var_ = var_ + Scaler.__init__(self, name=name) + + @classmethod + def _skl_class(cls): + return preprocessing.StandardScaler + + +class IdentityScaler(StandardScaler): + """ Data scaler that scales nothing. """ + + def __init__(self, mean_: Vector = None, scale_: Vector = None, var_: Vector = None, name: str = 'identity_scaler'): + StandardScaler.__init__(self, mean_=mean_, scale_=scale_, var_=var_, name=name) + + def _init_empty(self): + return self._skl_class()(with_mean = False, with_std = False) + + +class LabelBinarizer(Scaler): + """ + Data scaler used in `MLPClassifier` to standardize class labels. Only implemented for `MLPClassifier` to work. + + :param classes_: + List of classes to standardize. Can be any int. + + :param y_type_: + Type of output labels. + + :param sparse_input_: + Specify if the inputs are a sparse matrix or not. + """ + + _rebuild_attributes = ['classes_', 'y_type_', 'sparse_input_'] + + def __init__(self, classes_: list[int] = None, y_type_: str = 'multiclass', sparse_input_: bool = False, + name: str = ''): + self.classes_ = classes_ + self.y_type_ = y_type_ + self.sparse_input_ = sparse_input_ + Scaler.__init__(self, name=name) + + @classmethod + def _skl_class(cls): + return preprocessing._label.LabelBinarizer + + def instantiate_skl(self): + """ Instantiate scikit-learn `LabelBinarizer` from `LabelBinarizer` object. """ + scaler = self._init_empty() + scaler.classes_ = npy.array(self.classes_) + scaler.y_type_ = self.y_type_ + scaler.sparse_input_ = self.sparse_input_ + return scaler + + +# ====================================================================================================================== +# M O D E L S +# ====================================================================================================================== +class Model(DessiaObject): + """ Base object for handling a scikit-learn models (classifier and regressor). 
""" + + _allowed_methods = DessiaObject._allowed_methods + ["predict", "score"] + + def __init__(self, parameters: dict[str, Any], name: str = ''): + self.parameters = parameters + DessiaObject.__init__(self, name=name) + + @classmethod + def _skl_class(cls): + raise NotImplementedError(f'Method _skl_class not implemented for {cls.__name__}.') + + def _call_skl_model(self): + return self._skl_class()(**self.parameters) + + def _instantiate_skl(self): + raise NotImplementedError(f'Method _instantiate_skl not implemented for {type(self).__name__}.') + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + raise NotImplementedError(f'Method _instantiate_dessia not implemented for {cls.__name__}.') + + @classmethod + def _check_criterion(cls, criterion: str): + return criterion + + @classmethod + def _check_outputs(cls, outputs: Matrix): + return outputs + + @classmethod + def init_for_modeler_(cls, **parameters: dict[str, Any]) -> 'Model': + """ Initialize class of Model with its name and hyperparameters to fit in Modeler. """ + return cls(parameters=parameters) + + @classmethod + def fit_(cls, inputs: Matrix, outputs: Matrix, name: str = '', **hyperparameters) -> 'Model': + """ Standard method to fit outputs to inputs thanks to a scikit-learn model. """ + model = cls._skl_class()(**hyperparameters) + model.fit(inputs, outputs) + return cls._instantiate_dessia(model, hyperparameters, name) + + def predict(self, inputs: Matrix) -> Union[Vector, Matrix]: + """ + Standard method to predict outputs from inputs with a `Model` or children. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :return: The predicted values for inputs. + """ + model = self._instantiate_skl() + return model.predict(inputs).tolist() + + + @classmethod + def fit_predict_(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, name: str = '', + **hyperparameters) -> tuple['Model', Union[Vector, Matrix]]: + """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. """ + model = cls.fit_(inputs, outputs, name, **hyperparameters) + return model, model.predict(predicted_inputs) + + def score(self, inputs: Matrix, outputs: Matrix) -> float: + """ + Compute the score of `Model` or children. + + Please be sure to fit the model before computing its score and use test data and not train data. + Train data is data used to train the model and shall not be used to evaluate its quality. + Test data is data used to test the model and must not be used to train (fit) it. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_outputs` + + :return: The score of `Model` or children (DessiaObject). + """ + model = self._instantiate_skl() + return model.score(inputs, outputs) + + +class LinearModel(Model): + """ Abstract class for linear models. """ + + def __init__(self, parameters: dict[str, Any], coef_: Matrix = None, intercept_: Matrix = None, name: str = ''): + self.coef_ = coef_ + self.intercept_ = intercept_ + Model.__init__(self, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + raise NotImplementedError('Method _skl_class not implemented for LinearModel. 
Please use '\ + 'Ridge or LinearRegression.') + + def _instantiate_skl(self): + model = self._call_skl_model() + model.coef_ = npy.array(self.coef_) + model.intercept_ = npy.array(self.intercept_) + return model + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + return cls(coef_=model.coef_.tolist(), intercept_=model.intercept_.tolist(), parameters=parameters, name=name) + + +class Ridge(LinearModel): + """ + `Ridge` regression. It is a linear or least square regression but computed with a regularization term `alpha`. + + The model searched with this method is of the form `Y = A.X + B`, where `Y` are the outputs, `X` the inputs, `A` and + `B` the matrices of the model. + + The function minimized to get the linear model is `|| Y - A.X + B || + alpha.|| A || = 0`. This means setting + `alpha` to `0` is equivalent than searching a linear model from a least square regression. + + More information: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param coef_: + List of coefficients of the model. Each element (i, j) of coef_ is the slope of the linear model predicting + the i-th output from the j-th input. + + :param intercept_: + List of offsets of the model. Each element (i, ) of intercept_ is added to the prediction made with coef_ to + compute the i-th element of outputs prediction. + + :param name: + Name of `Ridge` regression + """ + + _standalone_in_db = True + _allowed_methods = Model._allowed_methods + ["fit", "fit_predict", "init_for_modeler"] + + def __init__(self, parameters: dict[str, Any], coef_: Matrix = None, intercept_: Matrix = None, name: str = ''): + LinearModel.__init__(self, coef_=coef_, intercept_=intercept_, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + return linear_model.Ridge + + @classmethod + def init_for_modeler(cls, alpha: float = 1., fit_intercept: bool = True, tol: float = 0.001) -> 'Ridge': + """ + Initialize class `Ridge` with its name and hyperparameters to fit in Modeler. + + :param alpha: + Constant that multiplies the L2 term, controlling regularization strength. alpha must be a non-negative + float i.e. in [0, inf[. When alpha = 0, the objective is equivalent to ordinary least squares, + solved by the `LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Ridge` object is + not advised. Instead, you should use the `LinearRegression` object. If an array is passed, penalties are + assumed to be specific to the targets. Hence they must correspond in number. + + :param fit_intercept: + Whether to fit the intercept for this model. If set to False, no intercept will be used in calculations + (i.e. X and Y are expected to be centered). + + :param tol: + Precision of the solution. + + :return: The `Ridge` class, the hyperparameters to instantiate it and the future name of instance. + """ + return cls.init_for_modeler_(alpha=alpha, fit_intercept=fit_intercept, tol=tol) + + @classmethod + def fit(cls, inputs: Matrix, outputs: Matrix, alpha: float = 1., fit_intercept: bool = True, tol: float = 0.001, + name: str = '') -> 'Ridge': + """ + Standard method to fit outputs to inputs thanks to `Ridge` linear model from scikit-learn. 
+ + More information: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_outputs` + + :param alpha: + Constant that multiplies the L2 term, controlling regularization strength. alpha must be a non-negative + float i.e. in [0, inf[. When alpha = 0, the objective is equivalent to ordinary least squares, + solved by the `LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Ridge` object is + not advised. Instead, you should use the `LinearRegression` object. If an array is passed, penalties are + assumed to be specific to the targets. Hence they must correspond in number. + + :param fit_intercept: + Whether to fit the intercept for this model. If set to False, no intercept will be used in calculations + (i.e. X and Y are expected to be centered). + + :param tol: + Precision of the solution. + + :param name: + Name of `Ridge` model + + :return: The `Ridge` model fit on inputs and outputs. + """ + return cls.fit_(inputs, outputs, name=name, alpha=alpha, fit_intercept=fit_intercept, tol=tol) + + @classmethod + def fit_predict(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, alpha: float = 1., + fit_intercept: bool = True, tol: float = 0.001, + name: str = '') -> tuple['Ridge', Union[Vector, Matrix]]: + """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. """ + return cls.fit_predict_(inputs, outputs, predicted_inputs, name=name, + alpha=alpha, fit_intercept=fit_intercept, tol=tol) + + +class LinearRegression(LinearModel): + """ + Linear regression. + + The model searched with this method is of the form `Y = A.X + B`, where `Y` are the outputs, `X` the inputs, `A` and + `B` the matrices of the model. + + The function minimized to get the linear model is `|| Y - A.X + B || = 0`. + + More information: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param coef_: + List of coefficients of the model. Each element (i, j) of coef_ is the slope of the linear model predicting + the i-th output from the j-th input. + + :param intercept_: + List of offsets of the model. Each element (i, ) of intercept_ is added to the prediction made with coef_ to + compute the i-th element of outputs prediction. + + :param name: + Name of Linear regression + """ + + _standalone_in_db = True + _allowed_methods = Model._allowed_methods + ["fit", "fit_predict", "init_for_modeler"] + + def __init__(self, parameters: dict[str, Any], coef_: Matrix = None, intercept_: Matrix = None, name: str = ''): + LinearModel.__init__(self, coef_=coef_, intercept_=intercept_, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + return linear_model.LinearRegression + + @classmethod + def init_for_modeler(cls, fit_intercept: bool = True, positive: bool = False) -> 'LinearRegression': + """ + Initialize class `LinearRegression` with its name and hyperparameters to fit in Modeler. + + :param fit_intercept: + Whether to fit the intercept for this model. If set to False, no intercept will be used in calculations + (i.e. X and Y are expected to be centered). + + :param positive: + When set to True, forces the coefficients to be positive. This option is only supported for dense arrays. + + :return: The `LinearRegression` model fit on inputs and outputs. 
+ """ + return cls.init_for_modeler_(fit_intercept=fit_intercept, positive=positive) + + @classmethod + def fit(cls, inputs: Matrix, outputs: Matrix, fit_intercept: bool = True, positive: bool = False, + name: str = '') -> 'LinearRegression': + """ + Standard method to fit outputs to inputs thanks to Linear Regression model from scikit-learn. + + More information: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_outputs` + + :param fit_intercept: + Whether to fit the intercept for this model. If set to False, no intercept will be used in calculations + (i.e. X and Y are expected to be centered). + + :param positive: + When set to True, forces the coefficients to be positive. This option is only supported for dense arrays. + + :param name: + Name of `LinearRegression` model + + :return: The Linear model fit on inputs and outputs. + """ + return cls.fit_(inputs, outputs, name=name, fit_intercept=fit_intercept, positive=positive) + + @classmethod + def fit_predict(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, fit_intercept: bool = True, + positive: bool = False, name: str = '') -> tuple['LinearRegression', Union[Vector, Matrix]]: + """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. """ + return cls.fit_predict_(inputs, outputs, predicted_inputs, name=name, + fit_intercept=fit_intercept, positive=positive) + + +class Tree(Model): + """ + Base object for handling a scikit-learn tree._tree.Tree object (Cython). + + Please refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html for more + information on attributes of Tree object and understanding the decision tree structure for basic usage. + + :param n_classes: + Number of output classes to predict from data. + + :param n_features: + Number of features to handle in data. + + :param n_outputs: + The number of outputs when fit is performed. + + :param tree_state: + All required values to re-instantiate a fully working scikit-learn tree are stored in this parameter. 
+ + :param name: + Name of Tree + """ + + def __init__(self, n_classes: list[int] = None, n_features: int = None, n_outputs: int = None, + tree_state: dict[str, Any] = None, parameters: dict[str, Any] = None, name: str = ''): + self.n_classes = n_classes + self.n_features = n_features + self.n_outputs = n_outputs + self.tree_state = tree_state + Model.__init__(self, parameters=None, name=name) + + def _data_hash(self): + hash_ = npy.linalg.norm(self.tree_state['values'][0]) + hash_ += sum(self.n_classes) + hash_ += self.n_features + hash_ += self.n_outputs + return int(hash_ % 1e5) + + @classmethod + def _skl_class(cls): + return tree._tree.Tree + + def _call_skl_model(self): + return self._skl_class()(self.n_features, npy.array(self.n_classes), self.n_outputs) + + @staticmethod + def _getstate(model): + state = model.__getstate__() + dessia_state = {'max_depth': int(state['max_depth']), + 'node_count': int(state['node_count']), + 'values': state['values'].tolist(), + 'nodes': {'dtypes': state['nodes'].dtype.descr, 'values': state['nodes'].tolist()}} + return dessia_state + + @staticmethod + def _setstate(model, state): + skl_state = {'max_depth': int(state['max_depth']), + 'node_count': int(state['node_count']), + 'values': npy.array(state['values']), + 'nodes': npy.array(state['nodes']['values'], dtype=state['nodes']['dtypes'][:-1])} + model.__setstate__(skl_state) + return model + + def _instantiate_skl(self): + model = self._call_skl_model() + model = self._setstate(model, self.tree_state) + return model + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + kwargs = {'name': name, + 'tree_state': cls._getstate(model), + 'n_classes': model.n_classes.tolist(), + 'n_features': model.n_features, + 'n_outputs': model.n_outputs} + return cls(**kwargs) + + +class DecisionTreeRegressor(Model): + """ + Base class for handling scikit-learn `DecisionTreeRegressor`. + + More information: https://scikit-learn.org/stable/modules/tree.html#tree + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param n_outputs_: + The number of outputs when fit is performed. + + :param tree_: + The underlying Tree object. + Please refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html for + attributes of Tree object and understanding the decision tree structure for basic usage of these attributes. + + :param name: + Name of `DecisionTreeRegressor` + """ + + _standalone_in_db = True + _allowed_methods = Model._allowed_methods + ["fit", "fit_predict", "init_for_modeler"] + + def __init__(self, parameters: dict[str, Any], n_outputs_: int = None, tree_: Tree = None, name: str = ''): + self.n_outputs_ = n_outputs_ + self.tree_ = tree_ + Model.__init__(self, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + return tree.DecisionTreeRegressor + + def generic_skl_attributes(self): + """ Generic method to set scikit-learn model attributes from self attributes. """ + model = self._call_skl_model() + model.n_outputs_ = self.n_outputs_ + model.tree_ = self.tree_._instantiate_skl() + return model + + @classmethod + def generic_dessia_attributes(cls, model, parameters: dict[str, Any], name: str = ''): + """ Generic method to set self attributes from scikit-learn model attributes. 
""" + return {'name': name, + 'tree_': Tree._instantiate_dessia(model.tree_, None), + 'n_outputs_': model.n_outputs_, + 'parameters': parameters} + + def _instantiate_skl(self): + return self.generic_skl_attributes() + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + return cls(**cls.generic_dessia_attributes(model, parameters=parameters, name=name)) + + @classmethod + def init_for_modeler(cls, criterion: str = 'squared_error', max_depth: int = None) -> 'DecisionTreeRegressor': + """ + Initialize class `DecisionTreeRegressor` with its name and hyperparameters to fit in Modeler. + + :param criterion: + The function to measure the quality of a split. Supported criteria are “squared_error” for the mean + squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, “friedman_mse”, which uses mean squared error with Friedman’s + improvement score for potential splits, “absolute_error” for the mean absolute error, which minimizes the + L1 loss using the median of each terminal node, and `“poisson”` which uses reduction in Poisson deviance to + find splits. + + :param max_depth: + The maximum depth of the tree. If `None`, then nodes are expanded until all leaves are pure or until all + leaves contain less than min_samples_split samples. + + :return: The `DecisionTreeRegressor` model fit on inputs and outputs. + """ + return cls.init_for_modeler_(criterion=criterion, max_depth=max_depth) + + @classmethod + def fit(cls, inputs: Matrix, outputs: Matrix, criterion: str = 'squared_error', max_depth: int = None, + name: str = '') -> 'DecisionTreeRegressor': + """ + Standard method to fit outputs to inputs thanks to `DecisionTreeRegressor` model from scikit-learn. + + More information: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_outputs` + + :param criterion: + The function to measure the quality of a split. Supported criteria are `“squared_error”` for the mean + squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, “friedman_mse”, which uses mean squared error with Friedman’s + improvement score for potential splits, “absolute_error” for the mean absolute error, which minimizes the + L1 loss using the median of each terminal node, and `“poisson”` which uses reduction in Poisson deviance to + find splits. + + :param max_depth: + The maximum depth of the tree. If `None`, then nodes are expanded until all leaves are pure or until all + leaves contain less than min_samples_split samples. + + :param name: + Name of `DecisionTreeRegressor` model + + :return: The `DecisionTreeRegressor` model fit on inputs and outputs. + """ + criterion = cls._check_criterion(criterion) + formated_outputs = cls._check_outputs(outputs) + return cls.fit_(inputs, formated_outputs, name=name, criterion=criterion, max_depth=max_depth) + + @classmethod + def fit_predict(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, criterion: str = 'squared_error', + max_depth: int = None, name: str = '') -> tuple['DecisionTreeRegressor', Union[Vector, Matrix]]: + """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. 
""" + criterion = cls._check_criterion(criterion) + return cls.fit_predict_(inputs, outputs, predicted_inputs, name=name, criterion=criterion, max_depth=max_depth) + + +class DecisionTreeClassifier(DecisionTreeRegressor): + """ + Base class for handling scikit-learn `DecisionTreeClassifier`. + + More information: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param n_classes_: + The number of classes (for single output problems), or a list containing the number of classes for each output + (for multi-output problems). + + :param classes_: + The number of outputs when fit is performed. + + :param n_outputs_: + The number of outputs when fit is performed. + + :param tree_: + The underlying Tree object. + Please refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html for + attributes of Tree object and understanding the decision tree structure for basic usage of these attributes. + + :param name: + Name of `DecisionTreeClassifier` + """ + + def __init__(self, parameters: dict[str, Any], n_classes_: Union[int, list[int]] = None, classes_: list[int] = None, + n_outputs_: int = None, tree_: Tree = None, name: str = ''): + self.n_classes_ = n_classes_ + self.classes_ = classes_ + DecisionTreeRegressor.__init__(self, n_outputs_=n_outputs_, tree_=tree_, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + return tree.DecisionTreeClassifier + + @classmethod + def _check_criterion(cls, criterion: str): + if criterion == 'squared_error': + return 'gini' + return criterion + + @classmethod + def _check_outputs(cls, outputs: Matrix): + if isinstance(outputs[0], float): + return [[int(output)] for output in outputs] + if not isinstance(outputs[0], list): + return [[output] for output in outputs] + if not isinstance(outputs[0][0], (int, str)): + return [list(map(int, output)) for output in outputs] + return outputs + + def _instantiate_skl(self): + model = self.generic_skl_attributes() + model.n_classes_ = self.n_classes_ + if isinstance(self.n_classes_, list): + model.classes_ = [npy.array(class_) for class_ in self.classes_] + else: + model.classes_ = npy.array(self.classes_) + return model + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + kwargs = cls.generic_dessia_attributes(model, parameters=parameters, name=name) + kwargs.update({'n_classes_': (model.n_classes_ if isinstance(model.n_classes_, (int, list)) + else model.n_classes_.tolist()), + 'classes_': (model.classes_.tolist() if isinstance(model.classes_, npy.ndarray) + else [klass.tolist() for klass in model.classes_])}) + return cls(**kwargs) + + def score(self, inputs: Matrix, outputs: Matrix) -> float: + """ + Compute the score of `Model` or children. + + Please be sure to fit the model before computing its score and use test data and not train data. + Train data is data used to train the model and shall not be used to evaluate its quality. + Test data is data used to test the model and must not be used to train (fit) it. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_outputs` + + :return: The score of `Model` or children (`DessiaObject`). + """ + model = self._instantiate_skl() + return model.score(inputs, outputs) + + +class RandomForest(Model): + """ + Base object for handling a scikit-learn `RandomForest` object. 
+ + More information: https://scikit-learn.org/stable/modules/ensemble.html#forest + """ + + _allowed_methods = Model._allowed_methods + ["fit", "fit_predict"] + + def __init__(self, parameters: dict[str, Any], n_outputs_: int = None, + estimators_: list[DecisionTreeRegressor] = None, name: str = ''): + self.estimators_ = estimators_ + self.n_outputs_ = n_outputs_ + Model.__init__(self, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + raise NotImplementedError('Method _skl_class not implemented for RandomForest. Please use '\ + 'RandomForestClassifier or RandomForestRegressor.') + + def generic_skl_attributes(self): + """ Generic method to set scikit-learn model attributes from self attributes. """ + model = self._call_skl_model() + model.estimators_ = [tree._instantiate_skl() for tree in self.estimators_] + model.n_outputs_ = self.n_outputs_ + return model + + @classmethod + def generic_dessia_attributes(cls, model, parameters: dict[str, Any], name: str = ''): + """ Generic method to set self attributes from scikit-learn model attributes. """ + return {'name': name, + 'n_outputs_': model.n_outputs_, + 'parameters': parameters} + + @classmethod + def init_for_modeler(cls, n_estimators: int = 100, criterion: str = 'squared_error', + max_depth: int = None) -> 'RandomForest': + """ + Initialize class `RandomForest` with its name and hyperparameters to fit in `Modeler`. + + :param n_estimators: + Number of `DecisionTree` contained in `RandomForestRegressor` or `RandomForestClassifier` + + :param criterion: + | - **Regressor:** The function to measure the quality of a split. Supported criteria are “squared_error” for + the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes + the L2 loss using the mean of each terminal node, “friedman_mse”, which uses mean squared error with + Friedman’s improvement score for potential splits, “absolute_error” for the mean absolute error, + which minimizes the L1 loss using the median of each terminal node, and `“poisson”` which uses reduction in + Poisson deviance to find splits. + + | - **Classifier:** The function to measure the quality of a split. Supported criteria are “gini” for the Gini + impurity and “log_loss” and “entropy” both for the Shannon information gain, see Mathematical formulation. + | Note: This parameter is tree-specific. + + :param max_depth: + The maximum depth of the tree. If `None`, then nodes are expanded until all leaves are pure or until all + leaves contain less than min_samples_split samples. + + :return: The `RandomForest` model fit on inputs and outputs. + """ + return cls.init_for_modeler_(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth) + + @classmethod + def fit(cls, inputs: Matrix, outputs: Matrix, n_estimators: int = 100, criterion: str = 'squared_error', + max_depth: int = None, name: str = '') -> 'RandomForest': + """ + Standard method to fit outputs to inputs thanks to `RandomForest` model from scikit-learn. 
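+
+        A minimal usage sketch through the concrete subclasses (toy values, for illustration only):
+
+        >>> inputs, outputs = [[0.], [1.], [2.], [3.]], [[0.], [2.], [4.], [6.]]
+        >>> forest = RandomForestRegressor.fit(inputs, outputs, n_estimators=10, max_depth=3)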
+ + More information: + - Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html + - Regressor: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_outputs` + + :param n_estimators: + Number of DecisionTree contained in `RandomForestRegressor` or `RandomForestClassifier` + + :param criterion: + | - **Regressor:** The function to measure the quality of a split. Supported criteria are “squared_error” for + the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes + the L2 loss using the mean of each terminal node, “friedman_mse”, which uses mean squared error with + Friedman’s improvement score for potential splits, “absolute_error” for the mean absolute error, + which minimizes the L1 loss using the median of each terminal node, and `“poisson”` which uses reduction in + Poisson deviance to find splits. + + | - **Classifier:** The function to measure the quality of a split. Supported criteria are “gini” for the Gini + impurity and “log_loss” and “entropy” both for the Shannon information gain, see Mathematical formulation. + | Note: This parameter is tree-specific. + + :param max_depth: + The maximum depth of the tree. If `None`, then nodes are expanded until all leaves are pure or until all + leaves contain less than min_samples_split samples. + + :param name: + Name of RandomForestRegressor or `RandomForestClassifier` model + + :return: The RandomForestRegressor or `RandomForestClassifier` model fit on inputs and outputs. + """ + criterion = cls._check_criterion(criterion) + formated_outputs = cls._check_outputs(matrix_1d_to_vector(outputs)) + return cls.fit_(inputs, formated_outputs, name=name, n_estimators=n_estimators, criterion=criterion, + max_depth=max_depth) + + @classmethod + def fit_predict(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, n_estimators: int = 100, + criterion: str = 'squared_error', max_depth: int = None, + name: str = '') -> tuple['RandomForest', Union[Vector, Matrix]]: + """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. """ + criterion = cls._check_criterion(criterion) + return cls.fit_predict_(inputs, outputs, predicted_inputs, name=name, n_estimators=n_estimators, + criterion=criterion, max_depth=max_depth, n_jobs=1) + + +class RandomForestRegressor(RandomForest): + """ + Base class for handling scikit-learn `RandomForestRegressor`. + + Please refer to https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html for + more information on `RandomForestRegressor`. + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param n_outputs_: + The number of outputs when fit is performed. + + :param estimators_: + The collection of fitted sub-trees. 
+ + :param name: + Name of `RandomForestRegressor` + """ + + _standalone_in_db = True + + def __init__(self, parameters: dict[str, Any], n_outputs_: int = None, + estimators_: list[DecisionTreeRegressor] = None, name: str = ''): + RandomForest.__init__(self, estimators_=estimators_, n_outputs_=n_outputs_, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + return ensemble.RandomForestRegressor + + def _instantiate_skl(self): + return self.generic_skl_attributes() + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + kwargs = cls.generic_dessia_attributes(model, parameters=parameters, name=name) + kwargs.update({'estimators_': [DecisionTreeRegressor._instantiate_dessia(tree, {}) + for tree in model.estimators_]}) + return cls(**kwargs) + + +class RandomForestClassifier(RandomForest): + """ + Base class for handling scikit-learn `RandomForestClassifier`. + + Please refer to https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html for + more information on `RandomForestClassifier`. + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param n_classes_: + The number of classes (for single output problems), or a list containing the number of classes for each output + (for multi-output problems). + + :param classes_: + The number of outputs when fit is performed. + + :param n_outputs_: + The number of outputs when fit is performed. + + :param estimators_: + The collection of fitted sub-trees. + + :param name: + Name of `RandomForestClassifier` + """ + + _standalone_in_db = True + + def __init__(self, parameters: dict[str, Any], n_classes_: int = None, classes_: list[int] = None, + n_outputs_: int = None, estimators_: list[DecisionTreeRegressor] = None, name: str = ''): + self.n_classes_ = n_classes_ + self.classes_ = classes_ + RandomForest.__init__(self, estimators_=estimators_, n_outputs_=n_outputs_, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + return ensemble.RandomForestClassifier + + @classmethod + def _check_criterion(cls, criterion: str): + if criterion == 'squared_error': + return 'gini' + return criterion + + @classmethod + def _check_outputs(cls, outputs: Matrix): + if isinstance(outputs[0], float): + return [int(output) for output in outputs] + if not isinstance(outputs[0], list): + return outputs + if not isinstance(outputs[0][0], (int, str)): + return [list(map(int, output)) for output in outputs] + return outputs + + def _instantiate_skl(self): + model = self.generic_skl_attributes() + model.n_classes_ = self.n_classes_ + if isinstance(self.n_classes_, list): + model.classes_ = [npy.array(class_) for class_ in self.classes_] + else: + model.classes_ = npy.array(self.classes_) + return model + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + kwargs = cls.generic_dessia_attributes(model, parameters=parameters, name=name) + kwargs.update({'estimators_': [DecisionTreeClassifier._instantiate_dessia(tree, {}) + for tree in model.estimators_], + 'n_classes_': (model.n_classes_ if isinstance(model.n_classes_, (int, list)) + else model.n_classes_.tolist()), + 'classes_': (model.classes_.tolist() if isinstance(model.classes_, npy.ndarray) + else [klass.tolist() for klass in model.classes_])}) + return cls(**kwargs) + + +class SupportVectorMachine(Model): + """ + Base object for handling a scikit-learn `SupportVectorMachine` objects. 
+ + Please refer to https://scikit-learn.org/stable/modules/svm.html for more + information on `SupportVectorMachine` object and understanding the `SupportVectorMachine` for basic usage. + """ + + _allowed_methods = Model._allowed_methods + ["fit", "fit_predict"] + + def __init__(self, parameters: dict[str, Any], raw_coef_: Matrix = None, _dual_coef_: Matrix = None, + _intercept_: Vector = None, support_: list[int] = 1, support_vectors_: Matrix = None, + _n_support: list[int] = None, _probA: Vector = None, _probB: Vector = None, _gamma: float = 1., + _sparse: bool = False, name: str = ''): + self.raw_coef_ = raw_coef_ + self._dual_coef_ = _dual_coef_ + self._intercept_ = _intercept_ + self.support_ = support_ + self.support_vectors_ = support_vectors_ + self._n_support = _n_support + self._probA = _probA + self._probB = _probB + self._gamma = _gamma + self._sparse = _sparse + Model.__init__(self, parameters=parameters, name=name) + + @classmethod + def _skl_class(cls): + raise NotImplementedError('Method _skl_class not implemented for SupportVectorMachine. Please use '\ + 'SupportVectorClassifier or SupportVectorRegressor.') + + def generic_skl_attributes(self): + """ Generic method to set scikit-learn model attributes from self attributes. """ + model = self._call_skl_model() + model.raw_coef_ = npy.array(self.raw_coef_) + model._dual_coef_ = npy.array(self._dual_coef_) + model._intercept_ = npy.array(self._intercept_) + model.support_ = npy.array(self.support_, dtype=npy.int32) + model.support_vectors_ = npy.array(self.support_vectors_) + model._n_support = npy.array(self._n_support, dtype=npy.int32) + model._probA = npy.array(self._probA) + model._probB = npy.array(self._probB) + model._gamma = self._gamma + model._sparse = self._sparse + return model + + @classmethod + def generic_dessia_attributes(cls, model, parameters: dict[str, Any], name: str = ''): + """ Generic method to set self attributes from scikit-learn model attributes. """ + return {'name': name, + 'parameters': parameters, + 'raw_coef_': model._get_coef().tolist(), + '_dual_coef_': model._dual_coef_.tolist(), + '_intercept_': model._intercept_.tolist(), + 'support_': model.support_.tolist(), + 'support_vectors_': model.support_vectors_.tolist(), + '_n_support': model._n_support.tolist(), + '_probA': model._probA.tolist(), + '_probB': model._probB.tolist(), + '_gamma': float(model._gamma), + '_sparse': model._sparse} + + @classmethod + def init_for_modeler(cls, C: float = 1., kernel: str = 'rbf') -> 'SupportVectorMachine': + """ + Initialize class `SupportVectorMachine` with its name and hyperparameters to fit in Modeler. + + :param C: + Regularization parameter. The strength of the regularization is inversely proportional to C. + Must be strictly positive. The penalty is a squared l2 penalty. + + :param kernel: + Specifies the kernel type to be used in the algorithm. + Can be one of `[‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’]`. If `None` is given, `‘rbf’` will be + used. If a callable is given, it is used to compute the kernel matrix from data matrices; that matrix + should be an matrix of shape `n_samples x n_samples`. + + :return: The `SupportVectorMachine` model fit on inputs and outputs. + """ + return cls.init_for_modeler_(C=C, kernel=kernel) + + @classmethod + def fit(cls, inputs: Matrix, outputs: Vector, C: float = 1., kernel: str = 'rbf', + name: str = '') -> 'SupportVectorMachine': + """ + Standard method to fit outputs to inputs thanks to `SupportVectorMachine` model from scikit-learn. 
+
+        More information:
+            - Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
+            - Regressor: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
+
+        :param inputs:
+            Matrix of data of dimension `n_samples x n_features`
+
+        :param outputs:
+            Matrix of data of dimension `n_samples x 1`
+
+        :param C:
+            Regularization parameter. The strength of the regularization is inversely proportional to C.
+            Must be strictly positive. The penalty is a squared l2 penalty.
+
+        :param kernel:
+            Specifies the kernel type to be used in the algorithm.
+            Can be one of `[‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’]`. If `None` is given, ‘rbf’ will be used.
+            If a callable is given, it is used to compute the kernel matrix from data matrices; that matrix should be
+            a matrix of shape `n_samples x n_samples`.
+
+        :param name:
+            Name of `SupportVectorRegressor` or `SupportVectorClassifier` model
+
+        :return: The `SupportVectorRegressor` or `SupportVectorClassifier` model fit on inputs and outputs.
+        """
+        if is_sequence(outputs[0]):
+            if len(outputs[0]) != 1:
+                raise NotImplementedError("Support Vector Machines do not handle multi-output.")
+        return cls.fit_(inputs, npy.array(outputs).ravel(), name=name, C=C, kernel=kernel)
+
+    @classmethod
+    def fit_predict(cls, inputs: Matrix, outputs: Vector, predicted_inputs: Matrix, C: float = 1., kernel: str = 'rbf',
+                    name: str = '') -> tuple['SupportVectorMachine', Union[Vector, Matrix]]:
+        """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. """
+        return cls.fit_predict_(inputs, npy.array(outputs).ravel(), predicted_inputs, name=name, C=C, kernel=kernel)
+
+
+class SupportVectorRegressor(SupportVectorMachine):
+    """
+    Base object for handling a scikit-learn `SupportVectorRegressor` object.
+
+    Please refer to https://scikit-learn.org/stable/modules/svm.html#svm-regression for more
+    information on the `SupportVectorRegressor` object and its basic usage.
+
+    :param parameters:
+        Hyperparameters of the used scikit-learn object.
+
+    :param raw_coef_:
+        Raw coefficients of the fitted model (weights assigned to the features), as extracted from the scikit-learn
+        model.
+
+    :param _dual_coef_:
+        Coefficients of the support vector in the decision function. Shape is `1 x _n_support`.
+
+    :param _intercept_:
+        Constants in decision function.
+
+    :param support_:
+        Indices of support vectors.
+
+    :param support_vectors_:
+        Support vectors.
+
+    :param _n_support:
+        Number of support vectors for each class.
+
+    :param _probA:
+        Parameter learned in Platt scaling when `probability=True`.
+        https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.probA_
+
+    :param _probB:
+        Parameter learned in Platt scaling when `probability=True`.
+        https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.probB_
+
+    :param _gamma:
+        Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.
+        - if `gamma='scale'` (default) is passed then it uses `1 / (n_features * X.var())` as value of gamma,
+        - if `‘auto’`, uses `1 / n_features`.
+
+    :param _sparse:
+        Specify if the inputs are a sparse matrix or not.
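+
+    :Examples:
+    Toy fitting sketch (values are arbitrary and only illustrate the call signature):
+
+    >>> inputs, outputs = [[0.], [1.], [2.], [3.]], [0., 1., 2., 3.]
+    >>> svr = SupportVectorRegressor.fit(inputs, outputs, C=1., kernel='rbf')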
+    """
+
+    _standalone_in_db = True
+
+    def __init__(self, parameters: dict[str, Any], raw_coef_: Matrix = None, _dual_coef_: Matrix = None,
+                 _intercept_: Vector = None, support_: list[int] = 1, support_vectors_: Matrix = None,
+                 _n_support: list[int] = None, _probA: Vector = None, _probB: Vector = None, _gamma: float = 1.,
+                 _sparse: bool = False, name: str = ''):
+        SupportVectorMachine.__init__(self, raw_coef_=raw_coef_, _dual_coef_=_dual_coef_,
+                                      support_vectors_=support_vectors_, _sparse=_sparse,
+                                      _n_support=_n_support, support_=support_, _intercept_=_intercept_, _probA=_probA,
+                                      _probB=_probB, _gamma=_gamma, parameters=parameters, name=name)
+
+    @classmethod
+    def _skl_class(cls):
+        return svm.SVR
+
+    def _instantiate_skl(self):
+        return self.generic_skl_attributes()
+
+    @classmethod
+    def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''):
+        return cls(**cls.generic_dessia_attributes(model, parameters=parameters, name=name))
+
+
+class SupportVectorClassifier(SupportVectorMachine):
+    """
+    Base object for handling a scikit-learn `SupportVectorClassifier` object.
+
+    Please refer to https://scikit-learn.org/stable/modules/svm.html#svm-classification for more
+    information on the `SupportVectorClassifier` object and its basic usage.
+
+    :param parameters:
+        Hyperparameters of the used scikit-learn object.
+
+    :param raw_coef_:
+        Raw coefficients of the fitted model (weights assigned to the features), as extracted from the scikit-learn
+        model.
+
+    :param _dual_coef_:
+        Coefficients of the support vector in the decision function. Shape is `1 x _n_support`.
+
+    :param _intercept_:
+        Constants in decision function.
+
+    :param support_:
+        Indices of support vectors.
+
+    :param support_vectors_:
+        Support vectors.
+
+    :param _n_support:
+        Number of support vectors for each class.
+
+    :param _probA:
+        Parameter learned in Platt scaling when `probability=True`.
+        https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.probA_
+
+    :param _probB:
+        Parameter learned in Platt scaling when `probability=True`.
+        https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.probB_
+
+    :param _gamma:
+        Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.
+        - if `gamma='scale'` (default) is passed then it uses `1 / (n_features * X.var())` as value of gamma,
+        - if `‘auto’`, uses `1 / n_features`.
+
+    :param _sparse:
+        Specify if the inputs are a sparse matrix or not.
+
+    :param classes_:
+        The class labels.
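+
+    :Examples:
+    Toy classification sketch (labels and values are arbitrary):
+
+    >>> inputs, labels = [[0.], [1.], [2.], [3.]], [0, 0, 1, 1]
+    >>> svc = SupportVectorClassifier.fit(inputs, labels, C=1., kernel='linear')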
+    """
+
+    _standalone_in_db = True
+
+    def __init__(self, parameters: dict[str, Any], raw_coef_: Matrix = None, _dual_coef_: Matrix = None,
+                 _intercept_: Vector = None, support_: list[int] = 1, support_vectors_: Matrix = None,
+                 _n_support: list[int] = None, _probA: Vector = None, _probB: Vector = None, _gamma: float = 1.,
+                 _sparse: bool = False, classes_: list[int] = None, name: str = ''):
+        self.classes_ = classes_
+        SupportVectorMachine.__init__(self, raw_coef_=raw_coef_, _dual_coef_=_dual_coef_,
+                                      support_vectors_=support_vectors_, _sparse=_sparse,
+                                      _n_support=_n_support, support_=support_, _intercept_=_intercept_, _probA=_probA,
+                                      _probB=_probB, _gamma=_gamma, parameters=parameters, name=name)
+
+    @classmethod
+    def _skl_class(cls):
+        return svm.SVC
+
+    def _instantiate_skl(self):
+        model = self.generic_skl_attributes()
+        model.classes_ = npy.array(self.classes_)
+        return model
+
+    @classmethod
+    def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''):
+        kwargs = cls.generic_dessia_attributes(model, parameters=parameters, name=name)
+        kwargs['classes_'] = model.classes_.tolist()
+        return cls(**kwargs)
+
+
+class MultiLayerPerceptron(Model):
+    """
+    Base object for handling a scikit-learn `MultiLayerPerceptron` (dense neural network).
+
+    Please refer to https://scikit-learn.org/stable/modules/neural_networks_supervised.html for more
+    information on `MultiLayerPerceptron`.
+    """
+
+    _allowed_methods = Model._allowed_methods + ["fit", "fit_predict", "init_for_modeler"]
+
+    def __init__(self, parameters: dict[str, Any], coefs_: list[Matrix] = None, intercepts_: Matrix = None,
+                 n_layers_: int = None, activation: str = 'relu', out_activation_: str = 'identity', name: str = ''):
+        self.coefs_ = coefs_
+        self.intercepts_ = intercepts_
+        self.n_layers_ = n_layers_
+        self.activation = activation
+        self.out_activation_ = out_activation_
+        Model.__init__(self, parameters=parameters, name=name)
+
+    @classmethod
+    def _skl_class(cls):
+        raise NotImplementedError('Method _skl_class not implemented for MultiLayerPerceptron. Please use '\
+                                  'MLPRegressor or MLPClassifier.')
+
+    def generic_skl_attributes(self):
+        """ Generic method to set scikit-learn model attributes from self attributes. """
+        model = self._call_skl_model()
+        model.coefs_ = [npy.array(coefs_) for coefs_ in self.coefs_]
+        model.intercepts_ = [npy.array(intercepts_) for intercepts_ in self.intercepts_]
+        model.n_layers_ = self.n_layers_
+        model.activation = self.activation
+        model.out_activation_ = self.out_activation_
+        return model
+
+    @classmethod
+    def generic_dessia_attributes(cls, model, parameters: dict[str, Any], name: str = ''):
+        """ Generic method to set self attributes from scikit-learn model attributes. """
+        return {'name': name,
+                'parameters': parameters,
+                'coefs_': [coefs_.tolist() for coefs_ in model.coefs_],
+                'intercepts_': [intercepts_.tolist() for intercepts_ in model.intercepts_],
+                'n_layers_': model.n_layers_,
+                'activation': model.activation,
+                'out_activation_': model.out_activation_}
+
+    @classmethod
+    def init_for_modeler(cls, hidden_layer_sizes: list[int], activation: str = 'relu', alpha: float = 0.0001,
+                         solver: str = 'adam', max_iter: int = 200, tol: float = 0.0001) -> 'MultiLayerPerceptron':
+        """
+        Initialize class `MultiLayerPerceptron` with its name and hyperparameters to fit in `Modeler`.
+
+        :param hidden_layer_sizes:
+            Sizes of the hidden layers: the i-th element represents the number of neurons in the i-th hidden layer.
+
+        :param activation:
+            Activation function for the hidden layer:
+                - `‘identity’`, no-op activation, useful to implement linear bottleneck, returns `f(x) = x`
+                - `‘logistic’`, the logistic sigmoid function, returns `f(x) = 1 / (1 + exp(-x))`.
+                - `‘tanh’`, the hyperbolic tan function, returns `f(x) = tanh(x)`.
+                - `‘relu’`, the rectified linear unit function, returns `f(x) = max(0, x)`
+
+        :param alpha:
+            Constant that multiplies the L2 term, controlling regularization strength. alpha must be a non-negative
+            float i.e. in `[0, inf[`.
+
+        :param solver:
+            The solver for weight optimization:
+                - `‘lbfgs’` is an optimizer in the family of quasi-Newton methods.
+                - `‘sgd’` refers to stochastic gradient descent.
+                - `‘adam’` refers to a stochastic gradient-based optimizer proposed in https://arxiv.org/abs/1412.6980
+            Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training
+            samples or more) in terms of both training time and validation score. For small datasets, however,
+            `‘lbfgs’` can converge faster and perform better.
+
+        :param max_iter:
+            Maximum number of iterations. The solver iterates until convergence (determined by `‘tol’`) or this number
+            of iterations. For stochastic solvers (`‘sgd’`, `‘adam’`), note that this determines the number of epochs
+            (how many times each data point will be used), not the number of gradient steps.
+
+        :param tol:
+            Tolerance for the optimization. When the loss or score is not improving by at least tol for
+            `n_iter_no_change` consecutive iterations, unless `learning_rate` is set to `‘adaptive’`, convergence is
+            considered to be reached and training stops.
+
+        :return: The `MultiLayerPerceptron` model fit on inputs and outputs.
+        """
+        return cls.init_for_modeler_(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha,
+                                     solver=solver, max_iter=max_iter, tol=tol)
+
+    @classmethod
+    def fit(cls, inputs: Matrix, outputs: Vector, hidden_layer_sizes: list[int], activation: str = 'relu',
+            alpha: float = 0.0001, solver: str = 'adam', max_iter: int = 200, tol: float = 0.0001,
+            name: str = '') -> 'MultiLayerPerceptron':
+        """
+        Standard method to fit outputs to inputs thanks to `MLPRegressor` or `MLPClassifier` models from scikit-learn.
+
+        More information:
+            - Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
+            - Regressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html
+
+        :param inputs:
+            Matrix of data of dimension `n_samples x n_features`
+
+        :param outputs:
+            Matrix of data of dimension `n_samples x n_outputs`
+
+        :param hidden_layer_sizes:
+            Sizes of the hidden layers: the i-th element represents the number of neurons in the i-th hidden layer.
+
+        :param activation:
+            Activation function for the hidden layer:
+                - `‘identity’`, no-op activation, useful to implement linear bottleneck, returns `f(x) = x`
+                - `‘logistic’`, the logistic sigmoid function, returns `f(x) = 1 / (1 + exp(-x))`.
+                - `‘tanh’`, the hyperbolic tan function, returns `f(x) = tanh(x)`.
+                - `‘relu’`, the rectified linear unit function, returns `f(x) = max(0, x)`
+
+        :param alpha:
+            Constant that multiplies the L2 term, controlling regularization strength. alpha must be a non-negative
+            float i.e. in [0, inf[.
+ + :param solver: + The solver for weight optimization: + - `‘lbfgs’` is an optimizer in the family of quasi-Newton methods. + - `‘sgd’` refers to stochastic gradient descent. + - `‘adam’` refers to a stochastic gradient-based optimizer proposed in https://arxiv.org/abs/1412.6980 + Note: The default solver `‘adam’` works pretty well on relatively large datasets (with thousands of training + samples or more) in terms of both training time and validation score. For small datasets, however, + `‘lbfgs’` can converge faster and perform better. + + :param max_iter: + Maximum number of iterations. The solver iterates until convergence (determined by `‘tol’`) or this number + of iterations. For stochastic solvers (`‘sgd’`, `‘adam’`), note that this determines the number of epochs + (how many times each data point will be used), not the number of gradient steps. + + :param tol: + Tolerance for the optimization. When the loss or score is not improving by at least tol for + `n_iter_no_change` consecutive iterations, unless `learning_rate` is set to `‘adaptive’`, convergence is + considered to be reached and training stops. + + :return: The `MLPRegressor` or `MLPClassifier` model fit on inputs and outputs. + """ + outputs = matrix_1d_to_vector(outputs) + return cls.fit_(inputs, outputs, name=name, hidden_layer_sizes=hidden_layer_sizes, activation=activation, + alpha=alpha, solver=solver, max_iter=max_iter, tol=tol) + + @classmethod + def fit_predict(cls, inputs: Matrix, outputs: Vector, predicted_inputs: Matrix, hidden_layer_sizes: list[int], + activation: str = 'relu', alpha: float = 0.0001, solver: str = 'adam', max_iter: int = 200, + tol: float = 0.0001, name: str = '') -> tuple['MultiLayerPerceptron', Union[Vector, Matrix]]: + """ Fit outputs to inputs and predict outputs for `predicted_inputs`: succession of fit and predict. """ + return cls.fit_predict_(inputs, outputs, predicted_inputs, name=name, hidden_layer_sizes=hidden_layer_sizes, + activation=activation, alpha=alpha, solver=solver, max_iter=max_iter, tol=tol) + + +class MLPRegressor(MultiLayerPerceptron): + """ + Base object for handling a scikit-learn `MLPRegressor` (dense neural network) object. + + Please refer to https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html for more + information on `MLPRegressor`. + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param coef_: + List of coefficients of the model. + + :param intercept_: + List of offsets of the model. + + :param n_layers_: + Number of hidden layers contained in the current `MLPRegressor`. + + :param activation: + Activation function for hidden layers. + + :param out_activation_: + Activation function for the output layer. 
+ + :param name: + Name of `MLPRegressor` + """ + + _standalone_in_db = True + + def __init__(self, parameters: dict[str, Any], coefs_: list[Matrix] = None, intercepts_: Matrix = None, + n_layers_: int = None, activation: str = 'relu', out_activation_: str = 'identity', name: str = ''): + MultiLayerPerceptron.__init__(self, coefs_=coefs_, intercepts_=intercepts_, n_layers_=n_layers_, + activation=activation, out_activation_=out_activation_, parameters=parameters, + name=name) + + @classmethod + def _skl_class(cls): + return neural_network.MLPRegressor + + def _instantiate_skl(self): + return self.generic_skl_attributes() + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + return cls(**cls.generic_dessia_attributes(model, parameters=parameters, name=name)) + + +class MLPClassifier(MultiLayerPerceptron): + """ + Base object for handling a scikit-learn `MLPClassifier` (dense neural network) object. + + Please refer to https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html for more + information on `MLPClassifier`. + + :param parameters: + Hyperparameters of the used scikit-learn object. + + :param coef_: + List of coefficients of the model. + + :param intercept_: + List of offsets of the model. + + :param n_layers_: + Number of hidden layers contained in the current `MLPClassifier`. + + :param activation: + Activation function for hidden layers. + + :param out_activation_: + Activation function for the output layer. + + :param n_outputs_: + The number of outputs when fit is performed. + + :param _label_binarizer: + Data scaler used in `MLPClassifier` to standardize class labels. + + :param name: + Name of `MLPClassifier` + """ + + _standalone_in_db = True + + def __init__(self, parameters: dict[str, Any], coefs_: list[Matrix] = None, intercepts_: Matrix = None, + n_layers_: int = None, activation: str = 'relu', out_activation_: str = 'identity', + n_outputs_: int = None, _label_binarizer: LabelBinarizer = None, name: str = ''): + self.n_outputs_ = n_outputs_ + self._label_binarizer = _label_binarizer + MultiLayerPerceptron.__init__(self, coefs_=coefs_, intercepts_=intercepts_, n_layers_=n_layers_, + activation=activation, out_activation_=out_activation_, parameters=parameters, + name=name) + + @classmethod + def _skl_class(cls): + return neural_network.MLPClassifier + + def _instantiate_skl(self): + model = self.generic_skl_attributes() + model.n_outputs_ = self.n_outputs_ + model._label_binarizer = self._label_binarizer.instantiate_skl() + return model + + @classmethod + def _instantiate_dessia(cls, model, parameters: dict[str, Any], name: str = ''): + kwargs = cls.generic_dessia_attributes(model, parameters=parameters, name=name) + kwargs.update({'n_outputs_': model.n_outputs_, + '_label_binarizer': LabelBinarizer.instantiate_dessia(model._label_binarizer)}) + return cls(**kwargs) + + +def get_scaler_attribute(scaler, attr: str): + """ Get attribute `attr` of scikit-learn scaler with an exception for numpy arrays (to instantiate a Scaler). """ + scaler_attr = getattr(scaler, attr) + if isinstance(scaler_attr, npy.ndarray): + return scaler_attr.tolist() + return scaler_attr + +def get_split_indexes(len_matrix: int, ratio: float = 0.8, shuffled: bool = True) -> tuple[Vector, Vector]: + """ Get two lists of indexes to split randomly a matrix in two matrices. 
""" + if ratio > 1: + len_train = int(ratio) + else: + len_train = int(len_matrix * ratio) + + idx_range = range(0, len_matrix) + ind_train = random.sample(idx_range, len_train) + ind_test = list(set(idx_range).difference(set(ind_train))) + + if not shuffled: + ind_train.sort() + ind_test.sort() + return ind_train, ind_test + +def train_test_split(*matrices: list[Matrix], ratio: float = 0.8, shuffled: bool = True) -> list[Matrix]: + """ + Split a list of matrices of the same length into a list of train and test split matrices. + + The first one is of length `int(len_matrix * ratio)`, the second of length `len_matrix - int(len_matrix * ratio)`. + + :param len_matrix: + Length of matrix to split. + + :param ratio: + Ratio on which to split matrix. If ratio > 1, ind_train will be of length `int(ratio)` and ind_test of + length `len_matrix - int(ratio)`. + + :param shuffled: + Whether to shuffle or not the results. + + :return: + A list containing all split matrices in the following order: `[train_M1, test_M1, train_M2, test_M2, ..., + train_Mn, test_Mn]`. + """ + len_matrices = [len(matrix) for matrix in matrices] + if len(set(len_matrices)) != 1: + raise ValueError("Matrices are not of the same length in train_test_split.") + + ind_train, ind_test = get_split_indexes(len_matrices[0], ratio=ratio, shuffled=shuffled) + train_test_split_matrices = [[[matrix[idx] for idx in ind_train], [matrix[idx] for idx in ind_test]] + for matrix in matrices] + return sum(train_test_split_matrices, []) + +def matrix_1d_to_vector(matrix: Matrix) -> Union[Vector, Matrix]: + """ Transform a `list[list[float]]` of shape `(n, 1)` into a `list[float]` of shape `(n,)`. """ + if isinstance(matrix[0], list): + if len(matrix[0]) == 1: + return sum(matrix, []) + return matrix + +def vector_to_2d_matrix(matrix: Union[Vector, Matrix]) -> Matrix: + """ Transform a `list[float]` of shape `(n,)` into a `list[list[float]]` of shape `(n, 1)`. """ + if not isinstance(matrix[0], list): + return [[x] for x in matrix] + return matrix diff --git a/dessia_common/datatools/metrics.py b/dessia_common/datatools/math.py similarity index 79% rename from dessia_common/datatools/metrics.py rename to dessia_common/datatools/math.py index f91ea3f6f..beb852298 100644 --- a/dessia_common/datatools/metrics.py +++ b/dessia_common/datatools/math.py @@ -1,31 +1,46 @@ """ Distances operators for metrics on Datasets. """ +from typing import Union + import numpy as npy from scipy.spatial.distance import mahalanobis +Vector = list[float] +Matrix = list[Vector] def diff_list(list_a, list_b): """ Difference between to lists. :param list_a: First list - :type list_a: List[float] + :type list_a: list[float] :param list_b: Second list - :type list_b: List[float] + :type list_b: list[float] :return: a generator of the difference between each element :rtype: generator """ return (a - b for a, b in zip(list_a, list_b)) +def maximums(matrix: Union[Vector, Matrix]) -> Vector: + """ Compute maximum values and store it in a list of length `len(matrix[0])`. """ + if not isinstance(matrix[0], list): + return [max(matrix)] + return [max(column) for column in zip(*matrix)] + +def minimums(matrix: Union[Vector, Matrix]) -> Vector: + """ Compute minimum values and store it in a list of length `len(matrix[0])`. """ + if not isinstance(matrix[0], list): + return [min(matrix)] + return [min(column) for column in zip(*matrix)] def l1_norm(vector): """ L1-norm of vector. 
:param vector: vector to get norm - :type vector: List[float] + :type vector: list[float] :return: the l1-norm :rtype: float @@ -38,7 +53,7 @@ def l2_norm(vector): L2-norm of vector. :param vector: vector to get norm - :type vector: List[float] + :type vector: list[float] :return: the l2-norm :rtype: float @@ -52,7 +67,7 @@ def lp_norm(vector, mink_power=2): Minkowski norm of vector. :param vector: vector to get norm - :type vector: List[float] + :type vector: list[float] :param mink_power: the value of exponent in Minkowski norm :type mink_power: float @@ -68,7 +83,7 @@ def inf_norm(vector): Infinite norm of vector. :param vector: vector to get norm - :type vector: List[float] + :type vector: list[float] :return: maximum value of absolute values in vector :rtype: float @@ -81,10 +96,10 @@ def manhattan_distance(list_a, list_b): Compute the l1 distance between list_a and list_b, i.e. the l1-norm of difference between list_a and list_b. :param list_a: First list - :type list_a: List[float] + :type list_a: list[float] :param list_b: Second list - :type list_b: List[float] + :type list_b: list[float] :return: the l1 distance between the two list :rtype: float @@ -93,17 +108,17 @@ def manhattan_distance(list_a, list_b): return l1_norm(diff_list(list_a, list_b)) -def euclidian_distance(list_a, list_b): +def euclidean_distance(list_a, list_b): """ Compute the euclidean distance between list_a and list_b, i.e. the l2-norm of difference between list_a and list_b. It is the natural distance of 3D space. :param list_a: First list - :type list_a: List[float] + :type list_a: list[float] :param list_b: Second list - :type list_b: List[float] + :type list_b: list[float] :return: the l2 distance between the two list :rtype: float @@ -117,10 +132,10 @@ def minkowski_distance(list_a, list_b, mink_power= 2): Compute the Minkowski distance between list_a and list_b, i.e. the lp-norm of difference between list_a and list_b. :param list_a: First list - :type list_a: List[float] + :type list_a: list[float] :param list_b: Second list - :type list_b: List[float] + :type list_b: list[float] :param mink_power: the value of exponent in Minkowski norm :type mink_power: float @@ -137,7 +152,7 @@ def mean(vector): Mean of vector. :param vector: vector to get mean - :type vector: List[float] + :type vector: list[float] :return: the mean of vector :rtype: float @@ -150,12 +165,12 @@ def variance(vector): Variance of vector. :param vector: vector to get variance - :type vector: List[float] + :type vector: list[float] :return: the variance of vector :rtype: float """ - # faster than euclidian_distance(vector, [mean(vector)] * len(vector))**2 / len(vector) + # faster than euclidean_distance(vector, [mean(vector)] * len(vector))**2 / len(vector) return float(npy.var(vector)) @@ -164,10 +179,10 @@ def covariance(vector_x, vector_y): Covariance between vector_x and vector_y. :param vector_x: first vector to get covariance - :type vector_x: List[float] + :type vector_x: list[float] :param vector_y: second vector to get covariance - :type vector_y: List[float] + :type vector_y: list[float] :return: the covariance between vector_x and vector_y :rtype: float @@ -185,7 +200,7 @@ def covariance_matrix(matrix): Compute the covariance matrix of `matrix` of dimension `N x M`. :return: the covariance matrix of `matrix` - :rtype: List[List[float]], `N x N` + :rtype: list[list[float]], `N x N` :Examples: >>> from dessia_common.datatools.metrics import covariance_matrix @@ -204,12 +219,12 @@ def std(vector): Standard deviation of vector. 
:param vector: vector to get standard deviation - :type vector: List[float] + :type vector: list[float] :return: the standard deviation of vector :rtype: float """ - # faster than euclidian_distance(vector, [mean(vector)] * len(vector)) / math.sqrt(len(vector)) + # faster than euclidean_distance(vector, [mean(vector)] * len(vector)) / math.sqrt(len(vector)) return float(npy.std(vector)) @@ -222,13 +237,13 @@ def mahalanobis_distance(list_a, list_b, cov_matrix): distances in spaces constituted of very different dimensions in terms of scale and data distribution. :param list_a: First list - :type list_a: List[float] + :type list_a: list[float] :param list_b: Second list - :type list_b: List[float] + :type list_b: list[float] :param cov_matrix: the covariance matrix of data - :type cov_matrix: List[List[float]] + :type cov_matrix: list[list[float]] :return: the Mahalanobis distance between the two list :rtype: float diff --git a/dessia_common/datatools/modeler.py b/dessia_common/datatools/modeler.py new file mode 100644 index 000000000..388cd49d3 --- /dev/null +++ b/dessia_common/datatools/modeler.py @@ -0,0 +1,696 @@ +""" Library for building machine learning modelers from Dataset or Lists using sklearn models handled in models. """ +from typing import List, Dict, Tuple, Union, Any + +import numpy as npy + +try: + from plot_data.core import Dataset as pl_Dataset + from plot_data.core import (EdgeStyle, Tooltip, MultiplePlots, PointStyle, PrimitiveGroup, Axis, Sample, Point2D, + LineSegment2D, Label, Graph2D) + from plot_data.colors import BLACK, RED, BLUE, WHITE +except ImportError: + pass + +from dessia_common.core import DessiaObject +from dessia_common.typings import JsonSerializable +from dessia_common.serialization import SerializableObject +from dessia_common.datatools import learning_models as models +from dessia_common.datatools.dataset import Dataset +from dessia_common.datatools.math import Vector, Matrix + +Points = list[dict[str, float]] + +NO_LINE = EdgeStyle(0.0001) +STD_LINE = EdgeStyle(line_width=1.5, color_stroke=BLACK) + +REF_POINT_STYLE = PointStyle(BLUE, BLUE, 0.1, 12, 'circle') +VAL_POINT_STYLE = PointStyle(RED, RED, 0.1, 12, 'circle') +LIN_POINT_STYLE = PointStyle(BLACK, BLACK, 0.1, 12, 'crux') +INV_POINT_STYLE = PointStyle(WHITE, WHITE, 0.1, 1, 'crux') + + +class SampleDataset(Dataset): + """ Class allowing to plot and study data generated from a DOE and its prediction from a modeler modeling. """ + + _standalone_in_db = True + _allowed_methods = Dataset._allowed_methods + ["from_matrices", "matrix"] + + def __init__(self, dessia_objects: list[Sample] = None, input_names: list[str] = None, + output_names: list[str] = None, name: str = ''): + self.input_names = input_names + self.output_names = output_names + Dataset.__init__(self, dessia_objects=dessia_objects, name=name) + self._common_attributes = None + if input_names is not None and output_names is not None: + self._common_attributes = input_names + output_names + + def to_dict(self, use_pointers: bool = True, memo=None, path: str = '#', id_method=True, + id_memo=None) -> JsonSerializable: + """ Specific to_dict method. 
""" + dict_ = super().to_dict(use_pointers, memo=memo, path=path) + for dessia_object in dict_['dessia_objects']: + for attr in dessia_object['values']: + dessia_object.pop(attr) + return dict_ + + @classmethod + def dict_to_object(cls, dict_: JsonSerializable, force_generic: bool = False, global_dict=None, + pointers_memo: dict[str, Any] = None, path: str = '#') -> 'SerializableObject': + """ Specific `dict_to_object` method. """ + dessia_objects = [Sample(obj['values'], obj['reference_path'], obj['name']) for obj in dict_['dessia_objects']] + return cls(dessia_objects, dict_['input_names'], dict_['output_names'], dict_['name']) + + def _printed_attributes(self): + return self.common_attributes + + @classmethod + def from_matrices(cls, inputs: Matrix, predictions: Matrix, input_names: list[str], output_names: list[str], + name: str = '') -> 'SampleDataset': + """ Build a `SampleDataset` from inputs matrix and their predictions matrix. """ + samples = [] + for index, (input_, pred) in enumerate(zip(inputs, predictions)): + sample = {attr: input_[attr_index] for attr_index, attr in enumerate(input_names)} + sample.update(dict(zip(output_names, pred))) + samples.append(Sample(sample, reference_path=f"#/dessia_objects/{index}", name=f"{name}_{index}")) + return cls(samples, input_names, output_names) + + @property + def matrix(self) -> Matrix: + """ Get equivalent matrix of `dessia_objects` (`len(dessia_objects) x len(common_attributes)`). """ + if self._matrix is None: + matrix = [] + for sample in self: + vector_features, temp_row = list(zip(*list(sample.values.items()))) + matrix.append([temp_row[vector_features.index(attr)] for attr in self.common_attributes]) + self._matrix = matrix + return self._matrix + + +class Modeler(DessiaObject): + """ + Object that encapsulate standard processes in machine learning modelings. + + `Modeler` object allows to: + * fit a model from models + * scale input and output data before fit or predict + * score a model from models + * validate a modeling process with cross_validation method + * plot performances and predictions of a model stored in `Modeler` + * store a fitted model and associated fitted scaler in a `Modeler` element that can be re-used in another + workflow as an already trained machine learning model + + :param model: + Fitted model to make predictions. + + :param input_scaler: + Scaler for input data. + + :param output_scaler: + Scaler for output data. + + :param name: + Name of `Modeler`. + """ + + _standalone_in_db = True + _allowed_methods = DessiaObject._allowed_methods + ["fit_matrix", "fit_dataset", "predict_matrix", + "predict_dataset", "fit_predict_matrix", "fit_predict_dataset", + "score_matrix", "score_dataset", "fit_score_matrix", + "fit_score_dataset"] + + def __init__(self, model: models.Model, input_scaler: models.Scaler, output_scaler: models.Scaler, name: str = ''): + self.model = model + self.input_scaler = input_scaler + self.output_scaler = output_scaler + self.in_scaled = self._is_scaled(self.input_scaler) + self.out_scaled = self._is_scaled(self.output_scaler) + DessiaObject.__init__(self, name=name) + + def _is_scaled(self, scaler: models.Scaler): + return not isinstance(scaler, models.IdentityScaler) + + def _format_output(self, scaled_outputs: Matrix): + """ Format output to list[list[float]] in any case for code consistency and simplicity. 
""" + if not isinstance(scaled_outputs[0], (list, tuple)): + return [[value] for value in scaled_outputs] + return scaled_outputs + + @staticmethod + def _compute_scalers(inputs: Matrix, outputs: Matrix, input_is_scaled: bool = True, output_is_scaled: bool = False, + name: str = '') -> tuple[models.Scaler, models.Scaler, Matrix, Matrix]: + in_scaler_class, input_scaler_name = models.Scaler.set_in_modeler(name, "in", input_is_scaled) + out_scaler_class, output_scaler_name = models.Scaler.set_in_modeler(name, "out", output_is_scaled) + + in_scaler, scaled_inputs = in_scaler_class.fit_transform(inputs, input_scaler_name) + out_scaler, scaled_outputs = out_scaler_class.fit_transform(outputs, output_scaler_name) + return in_scaler, out_scaler, scaled_inputs, scaled_outputs + + @classmethod + def _fit(cls, inputs: Matrix, outputs: Matrix, model: models.Model, input_is_scaled: bool = True, + output_is_scaled: bool = False, name: str = '') -> 'Modeler': + """ Private method to fit outputs to inputs with a machine learning method from datatools.models objects. """ + in_scaler, out_scaler, scaled_inputs, scaled_outputs = cls._compute_scalers(inputs, outputs, input_is_scaled, + output_is_scaled, name) + fit_model = model.fit(scaled_inputs, scaled_outputs, **model.parameters, name=name + '_model') + return cls(fit_model, in_scaler, out_scaler, name) + + @classmethod + def fit_matrix(cls, inputs: Matrix, outputs: Matrix, model: models.Model, input_is_scaled: bool = True, + output_is_scaled: bool = False, name: str = '') -> 'Modeler': + """ + Method to fit outputs to inputs with a machine learning method from datatools.models objects for matrix data. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_features` + + :param input_is_scaled: + Whether to standardize inputs or not with a `models.StandardScaler` + + :param output_is_scaled: + Whether to standardize outputs or not with a `models.StandardScaler` + + :param name: + Name of Modeler + + :return: The equivalent Modeler object containing the fitted model and scalers associated to inputs and outputs + """ + return cls._fit(inputs, outputs, model, input_is_scaled, output_is_scaled, name) + + @classmethod + def fit_dataset(cls, dataset: Dataset, input_names: list[str], output_names: list[str], model: models.Model, + input_is_scaled: bool = True, output_is_scaled: bool = False, name: str = '') -> 'Modeler': + """ + Method to fit outputs to inputs with a machine learning method from datatools.models objects for a Dataset. + + :param dataset: + Dataset containing data, both inputs and outputs + + :param input_names: + Names of input features + + :param output_names: + Names of output features + + :param input_is_scaled: + Whether to standardize inputs or not with a `models.StandardScaler` + + :param output_is_scaled: + Whether to standardize outputs or not with a `models.StandardScaler` + + :param name: + Name of Modeler + + :return: The equivalent Modeler object containing the fitted model and scalers associated to inputs and outputs + """ + inputs, outputs = dataset.to_input_output(input_names, output_names) + return cls.fit_matrix(inputs, outputs, model, input_is_scaled, output_is_scaled, name) + + def _predict(self, inputs: list[list[float]]) -> Union[Vector, Matrix]: + """ Private method to predict outputs from inputs with self.model. 
""" + return self.output_scaler.inverse_transform(self.model.predict(self.input_scaler.transform(inputs))) + + def predict_matrix(self, inputs: list[list[float]]) -> Matrix: + """ + Method to predict outputs from inputs with the current Modeler for matrix data. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :return: The predicted values for inputs. + """ + return self._format_output(self._predict(inputs)) + + def predict_dataset(self, dataset: Dataset, input_names: list[str], output_names: list[str]) -> SampleDataset: + """ + Method to predict outputs from inputs with the current Modeler for Dataset object. + + :param dataset: + Dataset containing data, both inputs and outputs + + :param input_names: + Names of input features to predict + + :param output_names: + Names of predicted features + + :return: The predicted values for inputs. + """ + inputs = dataset.sub_matrix(input_names) + outputs = self.predict_matrix(inputs) + return SampleDataset.from_matrices(inputs, outputs, input_names, output_names, f'{self.name}_preds') + + @classmethod + def _fit_predict(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, model: models.Model, + input_is_scaled: bool = True, output_is_scaled: bool = False, + name: str = '') -> tuple['Modeler', Union[Vector, Matrix]]: + """ Private method to fit outputs to inputs and predict `predicted_inputs` for a Dataset. """ + modeler = cls._fit(inputs, outputs, model, input_is_scaled, output_is_scaled, name) + return modeler, modeler._predict(predicted_inputs) + + @classmethod + def fit_predict_matrix(cls, inputs: Matrix, outputs: Matrix, predicted_inputs: Matrix, model: models.Model, + input_is_scaled: bool = True, output_is_scaled: bool = False, + name: str = '') -> tuple['Modeler', Matrix]: + """ Fit outputs to inputs and predict `predicted_inputs` for matrix data (fit then predict). """ + modeler, predictions = cls._fit_predict(inputs, outputs, predicted_inputs, model, input_is_scaled, + output_is_scaled, name) + return modeler, modeler._format_output(predictions) + + @classmethod + def fit_predict_dataset(cls, fit_dataset: Dataset, to_predict_dataset: Dataset, input_names: list[str], + output_names: list[str], model: models.Model, input_is_scaled: bool = True, + output_is_scaled: bool = False, name: str = '') -> tuple['Modeler', Matrix]: + """ Fit outputs to inputs and predict outputs of `to_predict_dataset` (fit then predict). """ + modeler = cls.fit_dataset(fit_dataset, input_names, output_names, model, input_is_scaled, output_is_scaled, + name) + return modeler, modeler.predict_dataset(to_predict_dataset, input_names, output_names) + + def _score(self, inputs: Matrix, outputs: Matrix) -> float: + """ Compute the score of Modeler. """ + return self.model.score(self.input_scaler.transform(inputs), self.output_scaler.transform(outputs)) + + def score_matrix(self, inputs: Matrix, outputs: Matrix) -> float: + """ + Compute the score of Modeler from matrix. + + Please be sure to fit the model before computing its score and use test data and not train data. + Train data is data used to train the model and shall not be used to evaluate its quality. + Test data is data used to test the model and must not be used to train (fit) it. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_features` + + :return: The score of Modeler. 
+ """ + return self._score(inputs, outputs) + + def score_dataset(self, dataset: Dataset, input_names: list[str], output_names: list[str]) -> float: + """ + Compute the score of Modeler from Dataset. + + Please be sure to fit the model before computing its score and use test data and not train data. + Train data is data used to train the model and shall not be used to evaluate its quality. + Test data is data used to test the model and must not be used to train (fit) it. + + :param dataset: + Dataset containing data, both inputs and outputs + + :param input_names: + Names of input features + + :param output_names: + Names of output features + + :return: The score of Modeler. + """ + inputs, outputs = dataset.to_input_output(input_names, output_names) + return self._score(inputs, outputs) + + @classmethod + def _fit_score(cls, inputs_train: Matrix, inputs_test: Matrix, outputs_train: Matrix, outputs_test: Matrix, + model: models.Model, input_is_scaled: bool, output_is_scaled: bool, + name: str) -> tuple['Modeler', float]: + """ Private method to fit modeler with train matrices and test it with test matrices. """ + mdlr = cls._fit(inputs_train, outputs_train, model, input_is_scaled, output_is_scaled, name) + return mdlr, mdlr._score(inputs_test, outputs_test) + + @classmethod + def fit_score_matrix(cls, inputs: Matrix, outputs: Matrix, model: models.Model, input_is_scaled: bool, + output_is_scaled: bool, ratio: float = 0.8, name: str = '') -> tuple['Modeler', float]: + """ Fit modeler with train matrices and test it with test matrices. """ + in_train, in_test, out_train, out_test = models.train_test_split(inputs, outputs, ratio=ratio) + return cls._fit_score(in_train, in_test, out_train, out_test, model, input_is_scaled, + output_is_scaled, name) + + @classmethod + def fit_score_dataset(cls, dataset: Dataset, input_names: list[str], output_names: list[str], model: models.Model, + input_is_scaled: bool = True, output_is_scaled: bool = False, ratio: float = 0.8, + name: str = '') -> tuple['Modeler', float]: + """ Train test split dataset, fit modeler with train matrices and score it with test matrices. """ + train_dataset, test_dataset = dataset.train_test_split(ratio=ratio, shuffled=True) + inputs_train, output_train = train_dataset.to_input_output(input_names, output_names) + inputs_test, output_test = test_dataset.to_input_output(input_names, output_names) + return cls._fit_score(inputs_train, inputs_test, output_train, output_test, model, input_is_scaled, + output_is_scaled, name) + +class ValidationData(DessiaObject): + """ + Object that stores modeling data as inputs, outputs and predictions matrices. + + :param inputs: + Matrix of input data. + + :param outputs: + Matrix of output data. + + :param predictions: + Matrix of predicted inputs with a `Modeler`. + + :param input_names: + Names of input features. + + :param output_names: + Names of output features. + + :param name: + Name of `ValidationData`. + """ + + def __init__(self, inputs: Matrix, outputs: Matrix, predictions: Matrix, name: str = ''): + self.inputs = inputs + self.outputs = outputs + self.predictions = predictions + DessiaObject.__init__(self, name=name) + + def points(self, input_names: list[str], output_names: list[str], reference_path: str) -> Points: + """ Get output vs prediction for each row of outputs matrix. 
""" + samples_list = [] + for row, (input_, ref_out, pred_out) in enumerate(zip(self.inputs, self.outputs, self.predictions)): + values = {attr: input_[col] for col, attr in enumerate(input_names)} + values.update({f"{attr}_ref": ref_out[col] for col, attr in enumerate(output_names)}) + values.update({f"{attr}_pred": pred_out[col] for col, attr in enumerate(output_names)}) + full_reference_path = f"{reference_path}/dessia_objects/{row}" + name = f"Sample_{row}" + samples_list.append(Sample(values=values, reference_path=full_reference_path, name=name)) + return samples_list + + +class TrainTestData(DessiaObject): + """ + Object that train and test data to validate modelers. + + :param training_valdata: + `ValidationData` of training data. + + :param testing_valdata: + `ValidationData` of testing data. + + :param input_names: + Names of input features. + + :param output_names: + Names of output features. + + :param name: + Name of `TrainTestData`. + """ + + def __init__(self, training_valdata: ValidationData, testing_valdata: ValidationData, input_names: list[str], + output_names: list[str], name: str = ''): + self.training_valdata = training_valdata + self.testing_valdata = testing_valdata + self.input_names = input_names + self.output_names = output_names + DessiaObject.__init__(self, name=name) + + def _concatenate_outputs(self) -> Matrix: + return self.training_valdata.outputs + self.testing_valdata.outputs + \ + self.training_valdata.predictions + self.testing_valdata.predictions + + def _matrix_ranges(self) -> Matrix: + return matrix_ranges(self._concatenate_outputs(), nb_points=10) + + def _ref_pred_names(self) -> list[list[str]]: + return [[name + '_ref', name + '_pred'] for name in self.output_names] + + def _tooltip(self) -> Tooltip: + return Tooltip(self.input_names + sum(self._ref_pred_names(), [])) + + def _ref_pred_datasets(self, points_train: Points, points_test: Points) -> list[Point2D]: + ref_args = {'point_style': REF_POINT_STYLE, 'edge_style': NO_LINE, 'name': 'Train data'} + pred_args = {'point_style': VAL_POINT_STYLE, 'edge_style': NO_LINE, 'name': 'Test data'} + + points = [Point2D(sample.values['n_di_ref'], sample.values['n_di_pred'], ref_args['point_style'], + reference_path=sample.reference_path, + tooltip=f"ref: {sample.values['n_di_ref']} ; pred: {sample.values['n_di_pred']}") + for sample in points_train] + points += [Point2D(sample.values['n_di_ref'], sample.values['n_di_pred'], pred_args['point_style'], + reference_path=sample.reference_path, + tooltip=f"ref: {sample.values['n_di_ref']} ; pred: {sample.values['n_di_pred']}") + for sample in points_test] + return points + + def _bisectrice_points(self) -> Points: + hack_bisectrices = [] + for point in zip(*self._matrix_ranges()): + hack_bisectrices.append({f"{self.output_names[0]}_ref": point[0], f"{self.output_names[0]}_pred": point[0]}) + for idx, name in enumerate(self.output_names): + hack_bisectrices[-1].update({name + '_ref': point[idx], name + '_pred': point[idx]}) + return hack_bisectrices + + def _to_val_points(self, reference_path: str) -> list[pl_Dataset]: + return self.training_valdata.points(self.input_names, self.output_names, reference_path), \ + self.testing_valdata.points(self.input_names, self.output_names, reference_path), self._bisectrice_points() + + def build_labels(self) -> list[Label]: + return [ + Label(title="Train Data", shape=Point2D(0, 0, point_style=REF_POINT_STYLE)), + Label(title="Test Data", shape=Point2D(0, 0, point_style=VAL_POINT_STYLE)), + Label(title="y = x", 
shape=LineSegment2D([0, 0], [1, 1], edge_style=STD_LINE)) + ] + + def build_graphs(self, reference_path: str) -> tuple[PrimitiveGroup, list[Point2D]]: + """ Build elements and graphs for `plot_data` method. """ + points_train, points_test, points_bisectrice = self._to_val_points(reference_path) + primitives = [LineSegment2D([points_bisectrice[0]['n_di_ref'], points_bisectrice[0]['n_di_pred']], + [points_bisectrice[-1]['n_di_ref'], points_bisectrice[-1]['n_di_pred']], + edge_style=STD_LINE)] + primitives += [Point2D(point['n_di_ref'], point['n_di_pred'], point_style=LIN_POINT_STYLE) + for point in points_bisectrice] + primitives += self._ref_pred_datasets(points_train, points_test) + primitives += self.build_labels() + return [PrimitiveGroup(primitives, axis_on=True)], points_train + points_test + points_bisectrice + + def plot_data(self, reference_path: str = '#', **_): + """ Plot data method for `TrainTestData`. """ + graphs, elements = self.build_graphs(reference_path) + if len(graphs) == 1: + return graphs + return [MultiplePlots(elements=elements, plots=graphs, initial_view_on=True)] + + +class ModelValidation(DessiaObject): + """ Class to handle a modeler and the `TrainTestData` used to train and test it. """ + + _non_data_eq_attributes = ['_score'] + _standalone_in_db = True + _allowed_methods = DessiaObject._allowed_methods + ["from_matrix", "from_dataset", "scores"] + + def __init__(self, data: TrainTestData, score: float, name: str = ''): + self.data = data + self.score = score + DessiaObject.__init__(self, name=name) +# TODO: is this too heavy ? To merge with TrainTestData ? + + @classmethod + def _build(cls, modeler: Modeler, input_train: Matrix, input_test: Matrix, output_train: Matrix, + output_test: Matrix, input_names: list[str], output_names: list[str], + name: str = '') -> 'ModelValidation': + trained_mdlr, pred_test = Modeler.fit_predict_matrix(input_train, output_train, input_test, modeler.model, + modeler.in_scaled, modeler.out_scaled, name) + pred_train = trained_mdlr.predict_matrix(input_train) + train_test_data = TrainTestData(ValidationData(input_train, output_train, pred_train), + ValidationData(input_test, output_test, pred_test), + input_names, output_names, f"{name}_data") + return cls(train_test_data, trained_mdlr.score_matrix(input_test, output_test), name) + + + @classmethod + def from_matrix(cls, modeler: Modeler, inputs: Matrix, outputs: Matrix, input_names: list[str], + output_names: list[str], ratio: float = 0.8, name: str = '') -> 'ModelValidation': + """ + Create a `ModelValidation` object from inputs and outputs matrices. + + :param modeler: + Modeler type and its hyperparameters, stored in a `Modeler` object for the sake of simplicity. Here, + modeler does not need to be fitted. + + :param inputs: + Matrix of data of dimension `n_samples x n_features` + + :param outputs: + Matrix of data of dimension `n_samples x n_features` + + :param input_names: + Names of input features + + :param output_names: + Names of output features + + :param ratio: + Ratio on which to split matrix. If ratio > 1, `in_train` will be of length `int(ratio)` and `in_test` of + length `len_matrix - int(ratio)`. + + :param name: + Name of `ModelValidation` + + :return: A `ModelValidation` object, containing the fitted modeler, its score, train and test data and their + predictions for input, stored in a `TrainTestData` object. 
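+
+        :Examples:
+            A minimal sketch; the unfitted `modeler`, the matrices and the feature names are illustrative:
+
+            >>> validation = ModelValidation.from_matrix(modeler, inputs, outputs, ['x_1'], ['y_1'], ratio=0.8)
+            >>> score = validation.score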
+        """
+        in_train, in_test, out_train, out_test = models.train_test_split(inputs, outputs, ratio=ratio)
+        return cls._build(modeler, in_train, in_test, out_train, out_test, input_names, output_names, name)
+
+    @classmethod
+    def from_dataset(cls, modeler: Modeler, dataset: Dataset, input_names: list[str], output_names: list[str],
+                     ratio: float = 0.8, name: str = '') -> 'ModelValidation':
+        """
+        Create a `ModelValidation` object from a dataset.
+
+        :param modeler:
+            Modeler type and its hyperparameters, stored in a `Modeler` object for the sake of simplicity. Here,
+            modeler does not need to be fitted.
+
+        :param dataset:
+            Dataset containing data, both inputs and outputs
+
+        :param input_names:
+            Names of input features
+
+        :param output_names:
+            Names of output features
+
+        :param ratio:
+            Ratio on which to split matrix. If ratio > 1, `in_train` will be of length `int(ratio)` and `in_test` of
+            length `len_matrix - int(ratio)`.
+
+        :param name:
+            Name of `ModelValidation`
+
+        :return: A `ModelValidation` object, containing the fitted modeler, its score, train and test data and their
+            predictions for input, stored in a `TrainTestData` object.
+        """
+        train_dataset, test_dataset = dataset.train_test_split(ratio=ratio, shuffled=True)
+        in_train, out_train = train_dataset.to_input_output(input_names, output_names)
+        in_test, out_test = test_dataset.to_input_output(input_names, output_names)
+        return cls._build(modeler, in_train, in_test, out_train, out_test, input_names, output_names, name)
+
+    def plot_data(self, reference_path: str = '#', **_):
+        """ Plot data method for `ModelValidation`. """
+        return self.data.plot_data(reference_path=reference_path)
+
+
+class CrossValidation(DessiaObject):
+    """
+    Class to cross-validate a `Modeler` modeling.
+
+    The purpose of cross validation is to validate a modeling process for a specific type of machine learning
+    method, set with specific hyperparameters.
+    The first step of cross validation is to split data into train and test data. The model is then fitted with
+    train data and scored with test data. Train and test inputs are also predicted with the model and plotted
+    against the reference values: the closer the red points are to the black line, the more precisely the model
+    predicts new data.
+    This process is run `nb_tests` times. If every run shows a good score and a satisfying train-test plot, the
+    tested modeling is validated and can be reused in similar prediction processes.
+    """
+
+    _non_data_eq_attributes = ['_scores']
+    _standalone_in_db = True
+    _allowed_methods = DessiaObject._allowed_methods + ["from_matrix", "from_dataset", "scores"]
+
+    def __init__(self, model_validations: list[ModelValidation], name: str = ''):
+        self.model_validations = model_validations
+        self._scores = None
+        DessiaObject.__init__(self, name=name)
+
+    @property
+    def scores(self) -> Vector:
+        """ List of scores of modelers contained in `model_validations`. 
""" + if self._scores is None: + self._scores = [model_val.score for model_val in self.model_validations] + return self._scores + + def _points_scores(self, reference_path: str) -> Points: + scores = self.scores + samples_scores = [] + for idx, score in enumerate(scores): + values = {'Index': idx, 'Score': score} + full_reference_path = f"{reference_path}/model_validations/{idx}" + name = f"model_validation_{idx}" + samples_scores.append(Sample(values=values, reference_path=full_reference_path, name=name)) + return samples_scores + + def _plot_score(self, reference_path: str) -> Graph2D: + scores = self._points_scores(reference_path) + nidx = len(scores) + limits = pl_Dataset(elements=scores_limits(nidx), point_style=INV_POINT_STYLE, edge_style=NO_LINE) + axis = axis_style(nidx, nidx) + + scores_ds = pl_Dataset(elements=scores, tooltip=Tooltip(['Index', 'Score']), point_style=REF_POINT_STYLE, + edge_style=STD_LINE, name="Scores") + + return Graph2D(x_variable='Index', y_variable='Score', graphs=[scores_ds, limits], axis=axis) + + @classmethod + def from_matrix(cls, modeler: Modeler, inputs: Matrix, outputs: Matrix, input_names: list[str], + output_names: list[str], nb_tests: int = 5, ratio: float = 0.8, + name: str = '') -> 'CrossValidation': + """ Cross validation of modeler from inputs and outputs matrices, given `input_names` and `output_names`. """ + validations = [ModelValidation.from_matrix(modeler, inputs, outputs, input_names, output_names, ratio, + f"{name}_val_{idx}") for idx in range(nb_tests)] + return cls(validations, name) + + @classmethod + def from_dataset(cls, modeler: Modeler, dataset: Dataset, input_names: list[str], output_names: list[str], + nb_tests: int = 5, ratio: float = 0.8) -> 'CrossValidation': + """ + Cross validation of modeler from a Dataset object, given `input_names` and `output_names`. + + :param modeler: + Modeler type and its hyperparameters, stored in a `Modeler` object for the sake of simplicity. Here, + modeler does not need to be fitted. + + :param dataset: + Dataset containing data, both inputs and outputs + + :param input_names: + Names of input features + + :param output_names: + Names of output features + + :param nb_tests: + Number of train test validation to run in cross_validation method + + :param ratio: + Ratio on which to split matrix. If ratio > 1, `in_train` will be of length `int(ratio)` and `in_test` of + length `len_matrix - int(ratio)`. + """ + validations = [] + for idx in range(nb_tests): + name = f"{modeler.name}_val_{idx}" + validations.append(ModelValidation.from_dataset(modeler, dataset, input_names, output_names, ratio, name)) + return cls(validations, f"{name}_crossval") + + def plot_data(self, reference_path: str = '#', **_): + """ Plot data method for `CrossValidation`. """ + graphs = [] + for idx, validation in enumerate(self.model_validations): + graphs += validation.data.build_graphs(reference_path=f"{reference_path}/model_validations/{idx}")[0] + return [self._plot_score(reference_path=reference_path), + MultiplePlots(graphs, elements=[{"factice_key":0}], initial_view_on=True)] + + +def matrix_ranges(matrix: Matrix, nb_points: int = 20) -> Matrix: + """ Dessia linspace of `nb_points` points between extrema of each column of matrix. 
""" + ranges = [] + for feature_column in zip(*matrix): + min_value = min(feature_column) + max_value = max(feature_column) + step_range = (max_value - min_value)/nb_points + ranges.append(npy.arange(min_value, max_value, step_range).tolist() + [1.05 * max_value]) + return ranges + +def axis_style(nb_x: int = 10, nb_y: int = 10) -> Axis: + """ Set axis style for `Modeler` objects. """ + return Axis(nb_points_x=nb_x, nb_points_y=nb_y, axis_style=STD_LINE, grid_on=True) + +def scores_limits(number: int) -> Points: + """ Draw white points in scatter for it to be plotted between 0 and number on x axis and 0 and 1 on y axis. """ + return [{'Index': -0.05, 'Score': -0.05}, {'Index': number + 0.05, 'Score': 1.05}] diff --git a/dessia_common/datatools/sampling.py b/dessia_common/datatools/sampling.py index 4951abe3d..9c5399826 100644 --- a/dessia_common/datatools/sampling.py +++ b/dessia_common/datatools/sampling.py @@ -1,5 +1,5 @@ """ Library for sampling data. """ -from typing import List, Type +from typing import Type import random import numpy as npy @@ -18,10 +18,10 @@ class ClassSampler(DessiaObject): :type sampled_class: `type` :param sampled_attributes: List of varying attributes in the DOE - :type sampled_attributes: `List[BoundedAttributeValue]` + :type sampled_attributes: `list[BoundedAttributeValue]` :param constant_attributes: List of fixed attributes in the DOE - :type constant_attributes: `List[FixedAttributeValue]` + :type constant_attributes: `list[FixedAttributeValue]` :param name: Name of Sampler :type name: `str`, `optional`, defaults to `''` @@ -30,8 +30,8 @@ class ClassSampler(DessiaObject): _standalone_in_db = True _vector_features = [] - def __init__(self, sampled_class: Type, sampled_attributes: List[BoundedAttributeValue], - constant_attributes: List[FixedAttributeValue], name: str = ''): + def __init__(self, sampled_class: Type, sampled_attributes: list[BoundedAttributeValue], + constant_attributes: list[FixedAttributeValue], name: str = ''): self.sampled_class = sampled_class self.sampled_attributes = sampled_attributes self.constant_attributes = constant_attributes @@ -44,7 +44,7 @@ def _get_attributes_names(self): def _get_instances_numbers(self): return [1] * len(self.constant_attributes) + [attr.number for attr in self.sampled_attributes] - def _build_parameter_grid(self, instances_numbers: List[int]): + def _build_parameter_grid(self, instances_numbers: list[int]): parameter_grid = [] for attr, instances_number in zip(self.constant_attributes, instances_numbers[:len(self.constant_attributes)]): parameter_grid.append([attr.value]) diff --git a/dessia_common/tests.py b/dessia_common/tests.py index bc214e517..e32a2ca49 100644 --- a/dessia_common/tests.py +++ b/dessia_common/tests.py @@ -174,6 +174,7 @@ class Car(DessiaObject): _standalone_in_db = True _non_data_hash_attributes = ['name'] + _allowed_methods = ["from_csv"] def __init__(self, name: str, mpg: float, cylinders: int, displacement: dcm.Distance, horsepower: float, weight: dcm.Mass, acceleration: dcm.Time, model: int, origin: str): @@ -236,6 +237,7 @@ class RandDataD1(DessiaObject): _standalone_in_db = True _non_data_hash_attributes = ['name'] + _allowed_methods = ["create_dataset"] _nb_dims = 1 _vector_features = [f'p_{i+1}' for i in range(_nb_dims)] diff --git a/doc/source/datatools.rst b/doc/source/datatools.rst new file mode 100644 index 000000000..9767b7ec9 --- /dev/null +++ b/doc/source/datatools.rst @@ -0,0 +1,41 @@ +Datatools +**** + +Dataset +=============== + +A Dataset is a container object for 
DessiaObjects.
+
+It implements many features that help engineers explore the data contained in a list.
+Dataset includes, among others, a plot_data method, filtering capabilities, data exploration features, metrics and
+statistics, and can be clustered into a ClusteredDataset to group similar data.
+
+.. autoclass:: dessia_common.datatools.dataset.Dataset
+    :members:
+
+Sampler
+===============
+
+A Sampler is an object that generates a DOE from a class and bounds for its attributes.
+
+.. autoclass:: dessia_common.datatools.sampling.ClassSampler
+    :members:
+
+Clustered Dataset
+=================
+
+A Clustered Dataset is a container object for DessiaObjects.
+
+It implements many features that help engineers explore clusters in data.
+ClusteredDataset includes, among others, a plot_data method, user-friendly cluster manipulation and metrics.
+
+.. autoclass:: dessia_common.datatools.cluster.ClusteredDataset
+    :members:
+
+Modeling
+===============
+
+Modeling objects built on top of scikit-learn.
+
+.. automodule:: dessia_common.datatools.modeling
+    :members:
diff --git a/scripts/ci_scripts.py b/scripts/ci_scripts.py
index ed448eae2..c5b9d9408 100644
--- a/scripts/ci_scripts.py
+++ b/scripts/ci_scripts.py
@@ -16,6 +16,10 @@
     "sampling.py",
     "markdowns.py",
     "checks.py",
+    "datatools_models.py",
+    "datatools_modeler.py",
+    "datatools_math.py",
+
     "unit_tests.py",
 
     "docx_writer.py",
     "markdown_to_docx.py",
diff --git a/scripts/clustering.py b/scripts/clustering.py
index 92ff5bce1..ceb6fb124 100644
--- a/scripts/clustering.py
+++ b/scripts/clustering.py
@@ -44,12 +44,15 @@
 clustered_cars_without.labels[0] = 15000
 clustered_cars_without.labels[1] = -1
 clustered_cars_without.labels[2:100] = [999999] * len(clustered_cars_without[2:100])
-print(clustered_cars_without)
-hlist = Dataset(all_cars_wi_feat, name="cars")
-clist = ClusteredDataset.from_agglomerative_clustering(hlist, n_clusters=10, name="cars")
-split_clist = clist.clustered_sublists()
-split_clist[0].name = "15g6e4rg84reh56rt4h56j458hrt56gb41rth674r68jr6"
-print(split_clist)
+ref_strs = ["15000 | Chevrolet C... |", "+ 396 undisplayed", "0 | Chevy S-10 |", "| 999999 | F"]
+assert(all(string in clustered_cars_without.__str__() for string in ref_strs))
+
+dataset = Dataset(all_cars_wi_feat, name="cars")
+clustered_dataset = ClusteredDataset.from_agglomerative_clustering(dataset, n_clusters=10, name="cars")
+split_clustered_dataset = clustered_dataset.clustered_sublists()
+split_clustered_dataset[0].name = "15g6e4rg84reh56rt4h56j458hrt56gb41rth674r68jr6"
+ref_strs = ["0 | 15g6e4rg84r... | ['mp", "9 | cars_9 | ['m", "0 samples, 2 features, 10 clusters"]
+assert(all(string in split_clustered_dataset.__str__() for string in ref_strs))
 
 # Test ClusterResults instances on platform
 clustered_cars_without._check_platform()
diff --git a/scripts/dataset.py b/scripts/dataset.py
index dc662940b..f538305ed 100644
--- a/scripts/dataset.py
+++ b/scripts/dataset.py
@@ -1,15 +1,14 @@
 """
-Tests for dessia_common.Dataset class (loadings, check_platform and plots)
+Tests for dessia_common.Dataset class (loadings, check_platform and plots).
""" import random from dessia_common.core import DessiaObject from dessia_common.models import all_cars_no_feat, all_cars_wi_feat, rand_data_middl -from dessia_common.datatools.metrics import covariance, manhattan_distance, euclidian_distance, minkowski_distance,\ +from dessia_common.datatools.math import covariance, manhattan_distance, euclidean_distance, minkowski_distance,\ inf_norm, mahalanobis_distance from dessia_common.datatools.dataset import Dataset # Tests on common_attributes - class SubObject(DessiaObject): def __init__(self, sub_attr: float = 1.5, name: str = ''): self.sub_attr = sub_attr @@ -72,6 +71,10 @@ def to_vector(self): # Compute one common_attributes all_cars_without_features.common_attributes +# Compute features importances from RandomForest algorithm +input_attributes = ['displacement', 'horsepower', 'model', 'acceleration', 'cylinders'] +output_attributes = ['weight'] + # Check platform for datasets all_cars_with_features._check_platform() all_cars_without_features._check_platform() @@ -107,12 +110,14 @@ def to_vector(self): assert(int(manhattan_distance(all_cars_with_features.matrix[3], all_cars_with_features.matrix[125])) == 1361) assert(int(minkowski_distance(all_cars_with_features.matrix[3], all_cars_with_features.matrix[125], mink_power=7.2)) == 1275) -assert(int(euclidian_distance(all_cars_with_features.matrix[3], all_cars_with_features.matrix[125])) == 1277) +assert(int(euclidean_distance(all_cars_with_features.matrix[3], all_cars_with_features.matrix[125])) == 1277) assert(int(covariance(all_cars_with_features.matrix[3], all_cars_with_features.matrix[125])) == 1155762) assert(int(inf_norm([1, 2, 3, 45, 4., 4.21515, -12, -0, 0, -25214.1511])) == 25214) assert(int(mahalanobis_distance(all_cars_with_features.matrix[3], all_cars_with_features.matrix[125], all_cars_with_features.covariance_matrix())) == 2) +assert(all_cars_with_features.maximums == [46.6, 0.455, 230.0, 24.8, 5140.0]) +assert(all_cars_with_features.minimums == [0.0, 0.068, 0.0, 8.0, 1613.0]) # Tests for empty Dataset empty_list = Dataset() @@ -167,8 +172,7 @@ def to_vector(self): all_cars_without_features[[float]] raise ValueError("float should not work as __getitem__ object for Dataset") except Exception as e: - assert(e.args[0] == "key of type with elements not implemented for indexing " + - "Datasets") + assert(e.args[0] == "key of type with elements not implemented for indexing Datasets") try: covariance([1, 2], [1]) diff --git a/scripts/datatools_math.py b/scripts/datatools_math.py new file mode 100644 index 000000000..2a17e578f --- /dev/null +++ b/scripts/datatools_math.py @@ -0,0 +1,9 @@ +""" +Tests for dessia_common.datatools.math. +""" +from dessia_common.datatools.math import maximums, minimums + +int_vector = [1, 2, 3, 4, 5, 6] + +assert(maximums(int_vector*6) == [6]) +assert(minimums(int_vector*6) == [1]) diff --git a/scripts/datatools_modeler.py b/scripts/datatools_modeler.py new file mode 100644 index 000000000..11ad67de6 --- /dev/null +++ b/scripts/datatools_modeler.py @@ -0,0 +1,111 @@ +""" +Tests for dessia_common.datatools.modeler file. 
+""" +from dessia_common.models import all_cars_no_feat +from dessia_common.datatools.dataset import Dataset +from dessia_common.datatools import learning_models as models +from dessia_common.datatools.modeler import Modeler, ModelValidation, CrossValidation + +# ====================================================================================================================== +# Load Data +# ====================================================================================================================== +# Load data and put it in Datasets +training_data, testing_data = Dataset(all_cars_no_feat).train_test_split(0.8, False) # True is better but it is for test +inputs = ['displacement', 'horsepower', 'acceleration'] + +# Set outputs to predict for regression +outputs_reg = ['weight', 'mpg'] + +# Set outputs to predict for classification +outputs_clf = ['cylinders', 'model'] + +# Extract training matrices +input_train = training_data.sub_matrix(inputs) +output_train_reg = training_data.sub_matrix(outputs_reg) + +# Extract testing matrices +input_test = testing_data.sub_matrix(inputs) +output_test_reg = testing_data.sub_matrix(outputs_reg) + + +# ====================================================================================================================== +# Train and test +# ====================================================================================================================== +# Initialize machine learning modeling regressors to fit in next operations and use to predict (2 examples) +ridge = models.Ridge.init_for_modeler(alpha=0.01, fit_intercept=True, tol=0.01) # linear regression with regularization +random_forest = models.RandomForestClassifier.init_for_modeler(n_estimators=100, criterion='squared_error') # classifier +mlp = models.MLPRegressor.init_for_modeler(hidden_layer_sizes=(50, 50, 50), activation='relu', max_iter=500) # NN +svr = models.SupportVectorRegressor.init_for_modeler(C=0.1, kernel='rbf') + +# Train / Fit models (scaled output is not advised) +ridge_mdlr = Modeler.fit_matrix(input_train, output_train_reg, ridge, True, False, "ridge_modeler") +rf_mdlr = Modeler.fit_dataset(training_data, inputs, outputs_clf, random_forest, True, False, "rf_modeler") +mlp_mdlr = Modeler.fit_dataset(training_data, inputs, outputs_reg, mlp, True, True, "mlp_modeler") +svr_mdlr = Modeler.fit_dataset(training_data, inputs, [outputs_reg[1]], svr, True, True, "svr_modeler") + +# Get score of a trained model (use test data) +ridge_score = ridge_mdlr.score_matrix(input_test, output_test_reg) +# rf_score = rf_mdlr.score_dataset(testing_data, inputs, outputs_clf) # score not available with multioutput +mlp_scrore = mlp_mdlr.score_dataset(testing_data, inputs, outputs_reg) + +# Fit and score in a row +ridge_mdlr, ridge_score = Modeler.fit_score_matrix(input_train, output_train_reg, ridge, True, False, 250, "ridge") +mlp_mdlr, mlp_scrore = Modeler.fit_score_dataset(training_data, inputs, outputs_reg, mlp, True, True, 0.8, "mlp") + +# Predict with models +ridge_predictions = ridge_mdlr.predict_matrix(input_test) +rf_predictions = rf_mdlr.predict_dataset(testing_data, inputs, outputs_clf) +mlp_predictions = mlp_mdlr.predict_dataset(testing_data, inputs, outputs_reg) + +# Fit and predict in one operation (to use if hyperparameters are known to give a model with a good score) +ridge_mdlr, ridge_predictions = Modeler.fit_predict_matrix(input_train, output_train_reg, input_test, ridge, True, + False, "ridge_modeler") +rf_mdlr, rf_predictions = 
Modeler.fit_predict_dataset(training_data, testing_data, inputs, outputs_clf, random_forest, + True, False, "rf_modeler") +mlp_mdlr, mlp_predictions = Modeler.fit_predict_dataset(training_data, testing_data, inputs, outputs_reg, mlp, True, + True, "mlp_modeler") +svr_mdlr, svr_predictions = Modeler.fit_predict_dataset(training_data, testing_data, inputs, [outputs_reg[1]], svr, + True, False, "svr_modeler") + +# ====================================================================================================================== +# Validate models +# ====================================================================================================================== +# Mono validation (not advised) +model_validation = ModelValidation.from_dataset(mlp_mdlr, training_data, inputs, outputs_reg, 0.8, "validation") +model_validation_2 = ModelValidation.from_matrix(mlp_mdlr, input_train, output_train_reg, inputs, outputs_reg, 0.8) +assert(abs(model_validation.score - model_validation_2.score) <= 0.2) + +# Plots (and tests) +model_validation_clf = ModelValidation.from_dataset(rf_mdlr, training_data, inputs, [outputs_clf[0]], 0.8, "clf val") +validation_reg_plot = model_validation.plot_data() +validation_clf_plot = model_validation_clf.plot_data() +assert(type(validation_reg_plot[0]).__name__ == "MultiplePlots") +assert(type(validation_clf_plot[0]).__name__ == "Graph2D") +assert(len(validation_reg_plot[0].elements) == 335) +assert(len(validation_clf_plot[0].graphs[0].elements) == 259) + +# Cross Validation (advised) +cross_validation = CrossValidation.from_dataset(rf_mdlr, training_data, inputs, [outputs_clf[0]], 10, 0.75) +cross_validation_2 = CrossValidation.from_matrix(mlp_mdlr, input_train, output_train_reg, inputs, outputs_reg, 5, 0.75) +cross_val_plot = cross_validation.plot_data() +assert(len(cross_val_plot) == 2) +assert(len(cross_val_plot[1].plots) == 10) +assert(len(cross_val_plot[1].elements) == 1) +assert(len(cross_val_plot[1].plots[1].graphs[1].elements) == 81) + + +# ====================================================================================================================== +# Additionnal Tests +# ====================================================================================================================== +linear_regression = models.LinearRegression.init_for_modeler() +decision_tree_regression = models.DecisionTreeRegressor.init_for_modeler() +randomforest_reg = models.RandomForestRegressor.init_for_modeler() +supportvector_clf = models.SupportVectorClassifier.init_for_modeler() +mlp_clf = models.MLPClassifier.init_for_modeler(hidden_layer_sizes=(50, 50, 50)) + +try: + svr_mdlr = Modeler.fit_dataset(training_data, inputs, outputs_reg, svr, True, True, "svr_modeler") +except Exception as e: + assert isinstance(e, NotImplementedError) + +rf_predictions._check_platform() diff --git a/scripts/datatools_models.py b/scripts/datatools_models.py new file mode 100644 index 000000000..0f52f039e --- /dev/null +++ b/scripts/datatools_models.py @@ -0,0 +1,170 @@ +""" +Tests for dessia_common.datatools.models file +""" +import time +import numpy as npy +from sklearn import linear_model, tree, ensemble, svm, neural_network + +from dessia_common.models import all_cars_no_feat +from dessia_common.datatools import learning_models as models +from dessia_common.datatools.dataset import Dataset + +max_time_check_platform = 30. 
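+
+# Minimal standalone usage of the wrappers exercised below (values are illustrative, not the ones used in this
+# script); each dessia class mirrors the scikit-learn estimator it wraps and remains a serializable DessiaObject:
+#     scaler, scaled_inputs = models.StandardScaler().fit_transform(inputs)
+#     ridge = models.Ridge.fit(scaled_inputs, outputs, alpha=0.1)
+#     predictions = ridge.predict(scaled_inputs[:10])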
+ +# TODO review the way data are generated +# Load Data and put it in a Dataset (matrix is automatically computed) +dataset_example = Dataset(all_cars_no_feat) +inputs = dataset_example.sub_matrix(['displacement', 'horsepower', 'model', 'acceleration', 'cylinders']) +double_outputs = dataset_example.sub_matrix(['mpg', 'weight']) +labelled_outputs = [npy.random.randint(4) for _ in double_outputs] +doubled_labelled_outputs = [[npy.random.randint(4),npy.random.randint(4)] for _ in double_outputs] +mono_outputs = [output[0] for output in double_outputs] + +# Train test split +inputs_train, inputs_test, outputs_train, outputs_test = models.train_test_split(inputs, double_outputs, ratio=0.7) +assert(len(inputs_train) + len(inputs_test) == len(inputs)) +assert(len(outputs_train) + len(outputs_test) == len(double_outputs)) +assert(len(inputs_train) == len(outputs_train)) +assert(len(inputs_test) == len(outputs_test)) + +# Test scalers +idty_scaler = models.IdentityScaler().fit(dataset_example.matrix) +idty_matrix = idty_scaler.transform(dataset_example.matrix) +idty_scaler, idty_matrix = models.IdentityScaler().fit_transform(dataset_example.matrix) + +std_scaler = models.StandardScaler().fit(inputs) +std_inputs = std_scaler.transform(inputs) +std_multi_inputs = std_scaler.transform_matrices(inputs, inputs, inputs) +multi_inputs = std_scaler.inverse_transform_matrices(*std_multi_inputs) +std_scaler, std_inputs = models.StandardScaler().fit_transform(inputs) + +for i in range(2): + assert(all(value in std_multi_inputs[i][255] for value in std_multi_inputs[i + 1][255])) + assert(all(value in multi_inputs[i][255] for value in multi_inputs[i + 1][255])) + +# Hyperparameters +ridge_hyperparams = {'alpha': 0.1, 'tol': 0.00001, 'fit_intercept': True} +linearreg_hyperparams = {'fit_intercept': True, 'positive': False} +dt_hyperparams = {'max_depth': None} +rf_hyperparams = {'n_estimators': 40, 'max_depth': None} +svm_hyperparams = {'C': 0.1, 'kernel': 'rbf'} +mlp_hyperparams = {'hidden_layer_sizes': (100, 100, 100, 100, 100), 'alpha': 100, 'max_iter': 1000, 'solver': 'adam', + 'activation': 'identity', 'tol': 1.} + +hyperparameters = {'ridge_regressor': ridge_hyperparams, 'linearreg_regressor': linearreg_hyperparams, + 'dt_regressor': dt_hyperparams, 'dt_classifier': dt_hyperparams, + 'dt_classifier_doubled': dt_hyperparams, 'rf_regressor': rf_hyperparams, + 'rf_classifier': rf_hyperparams, + 'rf_classifier_doubled': rf_hyperparams, + 'svm_regressor': svm_hyperparams, 'svm_classifier': svm_hyperparams, + #'svm_classifier_doubled': svm_hyperparams, + 'mlp_regressor': mlp_hyperparams, 'mlp_classifier': mlp_hyperparams} + # 'mlp_classifier_doubled': mlp_hyperparams} + + +# Sklearn models +skl_models = {'ridge_regressor': linear_model.Ridge(**ridge_hyperparams), + 'linearreg_regressor': linear_model.LinearRegression(**linearreg_hyperparams), + 'dt_regressor': tree.DecisionTreeRegressor(**dt_hyperparams), + 'dt_classifier': tree.DecisionTreeClassifier(**dt_hyperparams), + 'dt_classifier_doubled': tree.DecisionTreeClassifier(**dt_hyperparams), + 'rf_regressor': ensemble.RandomForestRegressor(**rf_hyperparams), + 'rf_classifier': ensemble.RandomForestClassifier(**rf_hyperparams), + 'rf_classifier_doubled': ensemble.RandomForestClassifier(**rf_hyperparams), + 'svm_regressor': svm.SVR(**svm_hyperparams), + 'svm_classifier': svm.SVC(**svm_hyperparams), + # 'svm_classifier_doubled': svm.SVC(**svm_hyperparams), + 'mlp_regressor': neural_network.MLPRegressor(**mlp_hyperparams), + 'mlp_classifier': 
neural_network.MLPClassifier(**mlp_hyperparams)} + # 'mlp_classifier_doubled': neural_network.MLPClassifier(**mlp_hyperparams)} + +# Fit sklearn models +for key, model in skl_models.items(): + if 'regressor' in key: + if 'svm' in key: + model.fit(std_inputs[:-10], mono_outputs[:-10]) + continue + model.fit(std_inputs[:-10], double_outputs[:-10]) + continue + if 'doubled' in key: + model.fit(std_inputs[:-10], doubled_labelled_outputs[:-10]) + continue + model.fit(std_inputs[:-10], labelled_outputs[:-10]) + +# Dessia models +dessia_classes = {'ridge_regressor': models.Ridge, 'linearreg_regressor': models.LinearRegression, + 'dt_regressor': models.DecisionTreeRegressor, 'dt_classifier': models.DecisionTreeClassifier, + 'dt_classifier_doubled': models.DecisionTreeClassifier, + 'rf_regressor': models.RandomForestRegressor, 'rf_classifier': models.RandomForestClassifier, + 'rf_classifier_doubled': models.RandomForestClassifier, + 'svm_regressor': models.SupportVectorRegressor, 'svm_classifier': models.SupportVectorClassifier, + 'mlp_regressor': models.MLPRegressor, 'mlp_classifier': models.MLPClassifier} + #, 'mlp_classifier_doubled': models.MLPClassifier} + + +# Assert regenerated sklearn models from dessia models make the same predictions as sklearn models from sklearn.fit +dessia_models = {} +for key, model in skl_models.items(): + dessia_models[key] = dessia_classes[key]._instantiate_dessia(model, hyperparameters[key]) + assert(npy.all(dessia_models[key].predict(std_inputs[-10:]) == model.predict(std_inputs[-10:]))) + + +# Test dessia models methods +dessia_models = {} +for key, model in skl_models.items(): + print(key) + if 'regressor' in key: + if 'svm' in key: + local_outputs = mono_outputs + else: + local_outputs = double_outputs + else: + if 'doubled' in key: + local_outputs = doubled_labelled_outputs + else: + local_outputs = labelled_outputs + + parameters = hyperparameters[key] + dessia_models[key], preds = dessia_classes[key].fit_predict(std_inputs[:-10], local_outputs[:-10], std_inputs[-10:], + **parameters) + dessia_models[key] = dessia_classes[key].fit(std_inputs[:-10], local_outputs[:-10], **parameters) + try: + assert(isinstance(dessia_models[key].score(std_inputs[-10:], local_outputs[-10:]), float)) + except ValueError as e: + assert(e.args[0] == 'multiclass-multioutput is not supported' and + isinstance(dessia_models[key], (models.DecisionTreeClassifier, models.MLPClassifier, + models.SupportVectorClassifier, models.RandomForestClassifier))) + t=time.time() + dessia_models[key]._check_platform() + assert(time.time() - t <= max_time_check_platform) + + +# Tests errors and base objects +base_models = [models.Scaler(None), models.Model(None), models.LinearModel(None), models.RandomForest(None), + models.SupportVectorMachine(None), models.MultiLayerPerceptron(None)] +model = models.Model(None) + +for base_model in base_models: + try: + base_model._skl_class() + raise ValueError(f"_skl_class() should not work for {type(base_model)} object.") + except NotImplementedError as e: + assert isinstance(e, NotImplementedError) + +try: + model._instantiate_skl() + raise ValueError("_instantiate_skl() should not work for Model object.") +except NotImplementedError as e: + assert isinstance(e, NotImplementedError) + +try: + model._instantiate_dessia(None, None) + raise ValueError("_instantiate_dessia() should not work for Model object.") +except NotImplementedError as e: + assert isinstance(e, NotImplementedError) + +try: + models.train_test_split(inputs, [[1,2,3], [1,2,3]], ratio=0.7) + 
raise ValueError("train_test_split function should not work with matrices of different sizes.") +except ValueError as e: + assert isinstance(e, ValueError) diff --git a/scripts/utils/helpers.py b/scripts/utils/helpers.py index 44f2714c1..c598cc969 100644 --- a/scripts/utils/helpers.py +++ b/scripts/utils/helpers.py @@ -1,5 +1,5 @@ """ -Tests for dessia_common.utils.helpers +Tests for dessia_common.utils.helpers. """ from dessia_common.models import all_cars_no_feat from dessia_common.datatools.dataset import Dataset @@ -7,18 +7,18 @@ values_list = [all_cars_no_feat, all_cars_no_feat] values_hlist = [Dataset(all_cars_no_feat), Dataset(all_cars_no_feat)] -multi_types = [all_cars_no_feat, Dataset(all_cars_no_feat)] -wrong_type = [1,2,3,4,5,6] +heterogeneous_list = [all_cars_no_feat, Dataset(all_cars_no_feat)] +int_vector = [1, 2, 3, 4, 5, 6] assert(concatenate(values_list) == all_cars_no_feat + all_cars_no_feat) assert(concatenate(values_hlist) == Dataset(all_cars_no_feat + all_cars_no_feat)) try: - concatenate(multi_types) + concatenate(heterogeneous_list) except Exception as e: assert(e.args[0] == "Block Concatenate only defined for operands of the same type.") try: - concatenate(wrong_type) + concatenate(int_vector) except Exception as e: assert(e.args[0] == ("Block Concatenate only defined for classes with 'extend' method"))