"""Assignment - making a sklearn estimator and CV splitter.

The goal of this assignment is to implement by yourself:
- A scikit-learn estimator for the KNearestNeighbors for classification
  tasks and check that it is working properly.
- A scikit-learn CV splitter where the splits are based on a Pandas
  DateTimeIndex.

Detailed instructions are provided in the original problem description.
"""
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import BaseCrossValidator
from sklearn.utils.validation import check_X_y, check_is_fitted, validate_data
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances
from pandas.api.types import is_datetime64_any_dtype as is_datetime


class KNearestNeighbors(ClassifierMixin, BaseEstimator):
    """K-nearest-neighbors classifier using the Euclidean distance.

    Parameters
    ----------
    n_neighbors : int, default=1
        Number of neighbors used in the majority vote at prediction time.
    """

    def __init__(self, n_neighbors=1):
        """Initialize the KNearestNeighbors classifier.

        Parameters
        ----------
        n_neighbors : int, default=1
            Number of neighbors to use for classification.
        """
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Fit the classifier using the training data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target labels.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        self.n_features_in_ = X.shape[1]
        # KNN is a "lazy" learner: fitting only memorizes the training set.
        self.X_, self.y_ = X, y
        return self

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Data to predict on.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels for each test data sample.
        """
        check_is_fitted(self)
        # NOTE(review): validate_data requires scikit-learn >= 1.6.
        X = validate_data(self, X, ensure_2d=True, dtype=None, reset=False)
        # Compute all test-vs-train distances in a single vectorized call
        # instead of one pairwise_distances call per test sample.
        distances = pairwise_distances(X, self.X_)
        neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
        y_pred = np.empty(X.shape[0], dtype=self.y_.dtype)
        for i, labels in enumerate(self.y_[neighbors]):
            # Majority vote; np.unique returns sorted labels, so ties are
            # broken in favor of the smallest label (original behavior).
            values, counts = np.unique(labels, return_counts=True)
            y_pred[i] = values[np.argmax(counts)]
        return y_pred

    def score(self, X, y):
        """Calculate the accuracy of the model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Data to score on.
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        score : float
            Accuracy of the model computed for the (X, y) pairs.
        """
        return np.mean(self.predict(X) == y)


class MonthlySplit(BaseCrossValidator):
    """Cross-validator based on monthly splits.

    Each split corresponds to training on one month's data and testing on
    the following month's data.

    Parameters
    ----------
    time_col : str, default='index'
        Column of the input DataFrame that will be used to split the data.
        This column should be of type datetime. If split is called with a
        DataFrame where this column is not datetime, it will raise a
        ValueError. To use the index as the column, set `time_col` to
        'index'.
    """

    def __init__(self, time_col='index'):
        """Initialize the MonthlySplit cross-validator.

        Parameters
        ----------
        time_col : str, default='index'
            Column used for datetime-based splitting.
        """
        self.time_col = time_col

    def _to_time_frame(self, X):
        """Return ``(frame, column)`` with the datetime values as a column.

        The returned frame always carries a fresh positional RangeIndex so
        that row labels can be used directly as split indices.

        Raises
        ------
        ValueError
            If the requested column is not of a datetime dtype.
        """
        if self.time_col == 'index':
            # reset_index names the new column after the index if the index
            # is named ('index' otherwise), so look it up by position. This
            # fixes the previous behavior, which searched for a column
            # literally called 'index' and broke on named DatetimeIndexes.
            frame = X.reset_index()
            column = frame.columns[0]
        else:
            frame = X.reset_index(drop=True)
            column = self.time_col
        if not is_datetime(frame[column]):
            raise ValueError(f"{self.time_col} should be of type datetime.")
        return frame, column

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : DataFrame
            Input data.
        y : None
            Ignored.
        groups : None
            Ignored.

        Returns
        -------
        n_splits : int
            Number of splits, i.e. the number of month-to-month
            transitions spanned by the data.
        """
        frame, column = self._to_time_frame(X)
        last = frame[column].max()
        first = frame[column].min()
        return 12 * (last.year - first.year) + last.month - first.month

    def split(self, X, y=None, groups=None):
        """Generate indices for training and test sets.

        Parameters
        ----------
        X : DataFrame
            Input data.
        y : None
            Ignored.
        groups : None
            Ignored.

        Yields
        ------
        idx_train : ndarray
            Training set indices (one calendar month).
        idx_test : ndarray
            Test set indices (the following calendar month).
        """
        frame, column = self._to_time_frame(X)
        # Pass the ORIGINAL X: the previous code passed the already-reset
        # frame, which made get_n_splits reset the index a second time.
        n_splits = self.get_n_splits(X, y, groups)
        # A frequency-based Grouper also yields the empty months inside the
        # data range, so consecutive groups are consecutive calendar months.
        monthly = (
            frame.sort_values(column)
            .groupby(pd.Grouper(key=column, freq="ME"))
        )
        idxs = [batch.index.to_numpy() for _, batch in monthly]
        for i in range(n_splits):
            yield idxs[i], idxs[i + 1]