diff --git a/README.md b/README.md
index 7b560f8..4e54dbe 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,6 @@
 [![Documentation Status](https://readthedocs.org/projects/classifierpromax/badge/?version=latest)](https://classifierpromax.readthedocs.io/en/latest/?badge=latest)[![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120/)
 ![ci-cd](https://github.com/UBC-MDS/classifierpromax/actions/workflows/ci-cd.yml/badge.svg)
 [![codecov](https://codecov.io/gh/UBC-MDS/classifierpromax/branch/main/graph/badge.svg)](https://codecov.io/gh/UBC-MDS/classifierpromax)
-https://app.codecov.io/github/UBC-MDS/ClassifierProMax
-
 `classifierpromax` is a scikit-learn wrapper library that helps to train and optimize multiple classifier models in parallel.
 
 `ClassifierTrainer()`:
diff --git a/src/classifierpromax/ClassifierOptimizer.py b/src/classifierpromax/ClassifierOptimizer.py
index 095c240..c317460 100644
--- a/src/classifierpromax/ClassifierOptimizer.py
+++ b/src/classifierpromax/ClassifierOptimizer.py
@@ -5,34 +5,143 @@
 from sklearn.model_selection import RandomizedSearchCV, cross_validate
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
 
+# Helper function: Define hyperparameter distributions
+def get_param_distributions():
+    return {
+        'logreg': {
+            'logisticregression__C': loguniform(1e-2, 1e3),
+            'logisticregression__class_weight': [None, 'balanced']
+        },
+        'svc': {
+            'svc__C': loguniform(1e-2, 1e3),
+            'svc__class_weight': [None, 'balanced']
+        },
+        'random_forest': {
+            'randomforestclassifier__n_estimators': randint(10, 30),
+            'randomforestclassifier__max_depth': randint(5, 10)
+        }
+    }
+
+# Helper function: Define scoring metrics
+def get_scoring_metrics():
+    return {
+        "accuracy": "accuracy",
+        "precision": make_scorer(precision_score, zero_division=0, average='weighted'),
+        "recall": make_scorer(recall_score, average='weighted'),
+        "f1": make_scorer(f1_score, average='weighted'),
+    }
+
+# Helper function: Validate inputs
+def validate_inputs(model_dict, param_dist, X_train, y_train, scoring, n_iter, cv, random_state, n_jobs):
+
+    # Validate model_dict
+    if not isinstance(model_dict, dict) or not model_dict:
+        raise ValueError("model_dict must be a non-empty dictionary of sklearn Pipeline objects.")
+
+    for name, model in model_dict.items():
+        if not isinstance(name, str) or not name:
+            raise ValueError(f"Invalid model name '{name}'. Model names must be non-empty strings.")
+        if not isinstance(model, Pipeline):
+            raise ValueError(f"The model '{name}' is not a valid scikit-learn Pipeline.")
+
+    # Validate X_train
+    if not isinstance(X_train, (pd.DataFrame, np.ndarray)):
+        raise ValueError("X_train must be a pandas DataFrame or a numpy array.")
+    if X_train.size == 0:
+        raise ValueError("X_train cannot be empty.")
+
+    # Validate y_train
+    if not isinstance(y_train, (pd.Series, np.ndarray)):
+        raise ValueError("y_train must be a pandas Series or a numpy array.")
+    if y_train.size == 0:
+        raise ValueError("y_train cannot be empty.")
+
+    # Validate consistency of X_train and y_train
+    if X_train.shape[0] != y_train.shape[0]:
+        raise ValueError("The number of samples in X_train and y_train must match.")
+
+    # Validate scoring metric
+    valid_metrics = get_scoring_metrics().keys()
+    if scoring not in valid_metrics:
+        raise ValueError(f"Invalid scoring metric '{scoring}'. Choose from {list(valid_metrics)}.")
+
+    # Validate numeric parameters
+    if not isinstance(n_iter, int) or n_iter <= 0:
+        raise ValueError("n_iter must be a positive integer.")
+    if not isinstance(cv, int) or cv <= 1:
+        raise ValueError("cv must be an integer greater than 1.")
+    if not isinstance(random_state, int):
+        raise ValueError("random_state must be an integer.")
+    if not isinstance(n_jobs, int) or n_jobs == 0:
+        raise ValueError("n_jobs must be a nonzero integer (use -1 for all processors).")
+
+# Helper function: Optimize a single model
+def optimize_model(name, model, param_dist, X_train, y_train, scoring, n_iter, cv, random_state, n_jobs):
+    print(f"\nTraining {name}...")
+    search = RandomizedSearchCV(
+        estimator=model,
+        param_distributions=param_dist,
+        scoring=get_scoring_metrics()[scoring],
+        n_iter=n_iter,
+        cv=cv,
+        random_state=random_state,
+        n_jobs=n_jobs,
+        return_train_score=True
+    )
+    search.fit(X_train, y_train)
+    print(f"Best parameters for {name}: {search.best_params_}")
+    return search.best_estimator_
+
+# Helper function: Evaluate model performance
+def evaluate_model(name, model, X_train, y_train, cv):
+    cv_results = cross_validate(
+        model,
+        X_train,
+        y_train,
+        cv=cv,
+        scoring=get_scoring_metrics(),
+        return_train_score=True,
+        error_score='raise'
+    )
+    return pd.DataFrame(cv_results).agg(['mean', 'std']).T
+
+# Main function: ClassifierOptimizer
 def ClassifierOptimizer(model_dict, X_train, y_train, scoring='f1', n_iter=100, cv=5, random_state=42, n_jobs=-1):
     """
-    Optimizes a dictionary of scikit-learn Pipeline classifiers using RandomizedSearchCV and evaluates their performance.
-
-    Parameters:
+    Optimizes a dictionary of scikit-learn Pipeline classifiers using RandomizedSearchCV
+    and evaluates their performance.
+
+    Parameters:
     -----------
     model_dict : dict
         A dictionary where keys are model names (str) and values are scikit-learn Pipeline objects.
         Each pipeline must contain a classifier whose hyperparameters are defined in `param_dist`.
+
     X_train : pandas.DataFrame or numpy.ndarray
         The feature matrix for training the classifiers. Must have the same number of samples as `y_train`.
+
     y_train : pandas.Series or numpy.ndarray
         The target labels for training the classifiers. Must have the same number of samples as `X_train`.
-    scoring : dict, optional
-        A dictionary specifying scoring metrics to evaluate the classifiers during cross-validation.
-        Default is None, which uses the following metrics:
-        - "accuracy"
-        - "precision" (weighted)
-        - "recall" (weighted)
-        - "f1" (weighted)
-    n_iter : int, optional
-        The number of parameter settings sampled for RandomizedSearchCV. Default is 100.
-    cv : int, optional
-        The number of cross-validation folds for both RandomizedSearchCV and cross_validate. Default is 5.
-    random_state : int, optional
-        Random seed for reproducibility of RandomizedSearchCV. Default is 42.
-    n_jobs : int, optional
-        The number of jobs to run in parallel for RandomizedSearchCV. Default is -1 (use all available processors).
+
+    scoring : str, optional, default='f1'
+        The scoring metric to use for hyperparameter optimization and model evaluation.
+        Must be one of the following:
+        - "accuracy"
+        - "precision"
+        - "recall"
+        - "f1"
+
+    n_iter : int, optional, default=100
+        The number of parameter settings sampled for RandomizedSearchCV.
+
+    cv : int, optional, default=5
+        The number of cross-validation folds for both RandomizedSearchCV and cross_validate.
+
+    random_state : int, optional, default=42
+        Random seed for reproducibility of RandomizedSearchCV.
+
+    n_jobs : int, optional, default=-1
+        The number of jobs to run in parallel for RandomizedSearchCV (-1 uses all available processors).
 
     Returns:
     --------
@@ -40,9 +149,16 @@ def ClassifierOptimizer(model_dict, X_train, y_train, scoring='f1', n_iter=100,
         A dictionary containing the best estimators for each classifier after hyperparameter optimization.
 
     scoring_dict : dict
-        A dictionary containing cross-validation results for each optimized model, with metrics aggregated by mean and standard deviation.
+        A dictionary containing cross-validation results for each optimized model, with
+        metrics aggregated by mean and standard deviation.
 
-    Examples:
+    Raises:
+    -------
+    ValueError
+        If the input parameters are invalid (e.g., empty model dictionary, mismatched data shapes,
+        unsupported scoring metric).
+
+    Examples:
     ---------
     >>> from sklearn.pipeline import Pipeline
     >>> from sklearn.linear_model import LogisticRegression
@@ -64,94 +180,18 @@ def ClassifierOptimizer(model_dict, X_train, y_train, scoring='f1', n_iter=100,
     ... }
     >>> optimized_models, scoring_results = ClassifierOptimizer(model_dict, X_train, y_train)
     """
-
-
-    param_dist = {
-        'logreg' : {
-            'logisticregression__C': loguniform(1e-2, 1e3),
-            'logisticregression__class_weight': [None, 'balanced']
-        },
-        'svc' : {
-            'svc__C': loguniform(1e-2, 1e3),
-            'svc__class_weight': [None, 'balanced']
-        },
-        'random_forest' : {
-            'randomforestclassifier__n_estimators': randint(10,30),
-            'randomforestclassifier__max_depth': randint(5,10)
-        }
-    }
-
-    # Default metrics if not provided
-    scoring_metrics = {
-        "accuracy": "accuracy",
-        "precision": make_scorer(precision_score, zero_division=0, average='weighted'),
-        "recall": make_scorer(recall_score, average='weighted'),
-        "f1": make_scorer(f1_score, average='weighted'),
-    }
-
-    # Validate scoring
-    if scoring not in scoring_metrics:
-        raise ValueError(f"Invalid scoring metric '{scoring}'. Choose from {list(scoring_metrics.keys())}.")
-    # Validate model_dict
-    if not isinstance(model_dict, dict):
-        raise ValueError("model_dict must be a dictionary.")
-    if not model_dict:
-        raise ValueError("model_dict is empty. Please provide at least one model.")
-
-    for name, model in model_dict.items():
-        if not isinstance(model, Pipeline):
-            raise ValueError(f"The model '{name}' is not a valid scikit-learn Pipeline.")
-
-    # Validate X_train, y_train
-    if not isinstance(X_train, (pd.DataFrame, np.ndarray)):
-        raise ValueError("X_train must be a pandas DataFrame or a numpy array.")
-    if not isinstance(y_train, (pd.Series, np.ndarray)):
-        raise ValueError("y_train must be a pandas Series or a numpy array.")
-    if X_train.size == 0 or y_train.size == 0:
-        raise ValueError("X_train and y_train cannot be empty.")
-    if X_train.shape[0] != len(y_train):
-        raise ValueError("The number of samples in X_train and y_train must match.")
+    param_dist = get_param_distributions()
+    validate_inputs(model_dict, param_dist, X_train, y_train, scoring, n_iter, cv, random_state, n_jobs)
 
-    # Check if the model names match the param_dist keys
-    if not all(name in param_dist for name in model_dict):
-        raise ValueError("Each model name in model_dict must have corresponding hyperparameters in param_dist.")
-
-    # Drop dummy model
-    model_dict.pop('dummy', None)
-
     optimized_model_dict = {}
     scoring_dict = {}
-
-    # Loop through classifiers and perform RandomizedSearchCV
+
     for name, model in model_dict.items():
-        print(f"\nTraining {name}...")
-
-        search = RandomizedSearchCV(
-            estimator=model,
-            param_distributions=param_dist[name],
-            scoring=scoring_metrics[scoring],
-            n_iter=n_iter,
-            cv=cv,
-            random_state=random_state,
-            n_jobs=n_jobs,
-            return_train_score=True
+        best_model = optimize_model(
+            name, model, param_dist[name], X_train, y_train, scoring, n_iter, cv, random_state, n_jobs
         )
-
-        search.fit(X_train, y_train)
-        search.best_params_
-
-        best_model = search.best_estimator_
         optimized_model_dict[name] = best_model
-
-        cv_results = cross_validate(
-            best_model,
-            X_train,
-            y_train,
-            cv=cv,
-            scoring=scoring_metrics,
-            return_train_score=True,
-            error_score='raise'
-        )
-        scoring_dict[name] = pd.DataFrame(cv_results).agg(['mean', 'std']).T
-    return optimized_model_dict, scoring_dict
+        scoring_dict[name] = evaluate_model(name, best_model, X_train, y_train, cv)
+
+    return optimized_model_dict, scoring_dict
\ No newline at end of file
diff --git a/tests/test_ClassifierOptimizer.py b/tests/test_ClassifierOptimizer.py
index 0f75d70..43c3f6b 100644
--- a/tests/test_ClassifierOptimizer.py
+++ b/tests/test_ClassifierOptimizer.py
@@ -6,19 +6,19 @@
 from sklearn.svm import SVC
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import StandardScaler
-from sklearn.model_selection import train_test_split
 from classifierpromax.ClassifierOptimizer import ClassifierOptimizer
 
+# Fixture: Generate sample data
 @pytest.fixture
 def sample_data():
-    # Generate synthetic data for testing
-    X = pd.DataFrame(np.random.rand(100, 5), columns=[f"feature_{i}" for i in range(5)])
-    y = pd.Series(np.random.randint(0, 2, size=100))
-    return X, y
+    np.random.seed(42)
+    X_train = np.random.rand(100, 5)  # 100 samples, 5 features
+    y_train = np.random.randint(0, 2, size=100)  # Binary classification
+    return X_train, y_train
 
+# Fixture: Define test model dictionary
 @pytest.fixture
-def sample_models():
-    # Create a dictionary of pipelines for testing
+def test_models():
     return {
         'logreg': Pipeline([
             ('scaler', StandardScaler()),
@@ -29,60 +29,129 @@
             ('svc', SVC())
         ]),
         'random_forest': Pipeline([
-            ('scaler', StandardScaler()),
             ('randomforestclassifier', RandomForestClassifier())
         ])
     }
 
-def test_classifier_optimizer_valid_input(sample_data, sample_models):
-    X, y = sample_data
-    model_dict = sample_models
-    scoring = 'accuracy'
-
-    optimized_models, scoring_dict = ClassifierOptimizer(
-        model_dict=model_dict,
-        X_train=X,
-        y_train=y,
-        scoring=scoring,
-        n_iter=5,
-        cv=2
-    )
-
-    assert isinstance(optimized_models, dict)
-    assert isinstance(scoring_dict, dict)
-    for name in model_dict.keys():
-        assert name in optimized_models
-        assert name in scoring_dict
-
+# Test Case: Model dictionary is not a dictionary
 def test_classifier_optimizer_invalid_model_dict(sample_data):
-    X, y = sample_data
-    invalid_model_dict = "not_a_dict"
-    with pytest.raises(ValueError, match="model_dict must be a dictionary."):
-        ClassifierOptimizer(model_dict=invalid_model_dict, X_train=X, y_train=y, scoring='accuracy')
+    X_train, y_train = sample_data
+    invalid_model_dict = ["not", "a", "dict"]
+
+    with pytest.raises(ValueError, match="model_dict must be a non-empty dictionary of sklearn Pipeline objects."):
+        ClassifierOptimizer(invalid_model_dict, X_train, y_train)
 
+# Test Case: Model dictionary is empty
 def test_classifier_optimizer_empty_model_dict(sample_data):
-    X, y = sample_data
-    empty_model_dict = {}
-    with pytest.raises(ValueError, match="model_dict is empty. Please provide at least one model."):
-        ClassifierOptimizer(model_dict=empty_model_dict, X_train=X, y_train=y, scoring='accuracy')
-
-def test_classifier_optimizer_invalid_X_train(sample_models):
-    model_dict = sample_models
-    X_train = "not_a_dataframe"
-    y_train = pd.Series(np.random.randint(0, 2, size=100))
+    X_train, y_train = sample_data
+
+    with pytest.raises(ValueError, match="model_dict must be a non-empty dictionary of sklearn Pipeline objects."):
+        ClassifierOptimizer({}, X_train, y_train)
+
+# Test Case: Model names in `model_dict` are not strings
+def test_classifier_optimizer_invalid_model_names(sample_data):
+    X_train, y_train = sample_data
+    invalid_model_dict = {123: Pipeline([("scaler", StandardScaler()), ("logreg", LogisticRegression())])}
+
+    with pytest.raises(ValueError, match="Invalid model name '123'. Model names must be non-empty strings."):
+        ClassifierOptimizer(invalid_model_dict, X_train, y_train)
+
+# Test Case: Model dictionary contains non-Pipeline objects
+def test_classifier_optimizer_invalid_model_type(sample_data):
+    X_train, y_train = sample_data
+    invalid_model_dict = {"invalid_model": "not_a_pipeline"}
+
+    with pytest.raises(ValueError, match="The model 'invalid_model' is not a valid scikit-learn Pipeline."):
+        ClassifierOptimizer(invalid_model_dict, X_train, y_train)
+
+# Test Case: X_train is not a DataFrame or ndarray
+def test_classifier_optimizer_invalid_X_train(sample_data, test_models):
+    X_train, y_train = "invalid_input", sample_data[1]
+
     with pytest.raises(ValueError, match="X_train must be a pandas DataFrame or a numpy array."):
-        ClassifierOptimizer(model_dict=model_dict, X_train=X_train, y_train=y_train, scoring='accuracy')
+        ClassifierOptimizer(test_models, X_train, y_train)
+
+# Test Case: y_train is not a Series or ndarray
+def test_classifier_optimizer_invalid_y_train(sample_data, test_models):
+    X_train, y_train = sample_data[0], "invalid_labels"
 
-def test_classifier_optimizer_invalid_y_train(sample_models):
-    model_dict = sample_models
-    X_train = pd.DataFrame(np.random.rand(100, 5))
-    y_train = "not_a_series"
     with pytest.raises(ValueError, match="y_train must be a pandas Series or a numpy array."):
-        ClassifierOptimizer(model_dict=model_dict, X_train=X_train, y_train=y_train, scoring='accuracy')
+        ClassifierOptimizer(test_models, X_train, y_train)
+
+# Test Case: X_train is empty
+def test_classifier_optimizer_empty_X_train(test_models):
+    X_train = np.array([]).reshape(0, 5)
+    y_train = np.random.randint(0, 2, size=10)
+
+    with pytest.raises(ValueError, match="X_train cannot be empty."):
+        ClassifierOptimizer(test_models, X_train, y_train)
+
+# Test Case: y_train is empty
+def test_classifier_optimizer_empty_y_train(test_models):
+    X_train = np.random.rand(10, 5)
+    y_train = np.array([])
+
+    with pytest.raises(ValueError, match="y_train cannot be empty."):
+        ClassifierOptimizer(test_models, X_train, y_train)
+
+# Test Case: X_train and y_train have mismatched samples
+def test_classifier_optimizer_mismatched_X_y(test_models):
+    X_train = np.random.rand(50, 5)
+    y_train = np.random.randint(0, 2, size=100)
 
-def test_classifier_optimizer_mismatched_X_y_lengths(sample_models):
-    model_dict = sample_models
-    X_train = pd.DataFrame(np.random.rand(100, 5))
-    y_train = pd.Series(np.random.randint(0, 2, size=50))  # Mismatched length
     with pytest.raises(ValueError, match="The number of samples in X_train and y_train must match."):
-        ClassifierOptimizer(model_dict=model_dict, X_train=X_train, y_train=y_train, scoring='accuracy')
\ No newline at end of file
+        ClassifierOptimizer(test_models, X_train, y_train)
+
+# Test Case: Invalid scoring metric
+def test_classifier_optimizer_invalid_scoring(sample_data, test_models):
+    X_train, y_train = sample_data
+
+    with pytest.raises(ValueError, match="Invalid scoring metric 'invalid_score'. Choose from .*"):
+        ClassifierOptimizer(test_models, X_train, y_train, scoring="invalid_score")
+
+# Test Case: `n_iter` is non-positive
+@pytest.mark.parametrize("n_iter", [0, -5])
+def test_classifier_optimizer_invalid_n_iter(sample_data, test_models, n_iter):
+    X_train, y_train = sample_data
+
+    with pytest.raises(ValueError, match="n_iter must be a positive integer."):
+        ClassifierOptimizer(test_models, X_train, y_train, n_iter=n_iter)
+
+# Test Case: `cv` is too low
+def test_classifier_optimizer_invalid_cv(sample_data, test_models):
+    X_train, y_train = sample_data
+
+    with pytest.raises(ValueError, match="cv must be an integer greater than 1."):
+        ClassifierOptimizer(test_models, X_train, y_train, cv=1)
+
+# Test Case: Invalid `random_state`
+@pytest.mark.parametrize("random_state", ["invalid", 3.14, None])
+def test_classifier_optimizer_invalid_random_state(sample_data, test_models, random_state):
+    X_train, y_train = sample_data
+
+    with pytest.raises(ValueError, match="random_state must be an integer."):
+        ClassifierOptimizer(test_models, X_train, y_train, random_state=random_state)
+
+# Test Case: `n_jobs` is zero
+def test_classifier_optimizer_invalid_n_jobs(sample_data, test_models):
+    X_train, y_train = sample_data
+
+    with pytest.raises(ValueError, match="n_jobs must be a nonzero integer .*"):
+        ClassifierOptimizer(test_models, X_train, y_train, n_jobs=0)
+
+# Test Case: Different scoring metrics
+@pytest.mark.parametrize("scoring_metric", ["accuracy", "precision", "recall", "f1"])
+def test_classifier_optimizer_different_scoring(sample_data, test_models, scoring_metric):
+    X_train, y_train = sample_data
+
+    optimized_models, scoring_results = ClassifierOptimizer(
+        test_models, X_train, y_train, scoring=scoring_metric, n_iter=5, cv=3, random_state=42, n_jobs=1
+    )
+
+    assert isinstance(optimized_models, dict)
+    assert isinstance(scoring_results, dict)
+    assert set(optimized_models.keys()) == set(test_models.keys())
+
+    for model_name, scores in scoring_results.items():
+        assert isinstance(scores, pd.DataFrame)
+        assert any(scoring_metric in str(index) for index in scores.index), f"Scoring metric '{scoring_metric}' not found in results"
\ No newline at end of file
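Below is a minimal usage sketch of the refactored `ClassifierOptimizer` API, for reviewers trying the change locally. It is not part of the diff above; the synthetic data and the pipeline step names are illustrative assumptions, chosen only so the step names match the keys expected by `get_param_distributions()`.

```python
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from classifierpromax.ClassifierOptimizer import ClassifierOptimizer

# Illustrative synthetic data: 100 samples, 5 features, binary target
rng = np.random.default_rng(42)
X_train = pd.DataFrame(rng.random((100, 5)), columns=[f"feature_{i}" for i in range(5)])
y_train = pd.Series(rng.integers(0, 2, size=100))

# Dictionary keys and pipeline step names must line up with get_param_distributions():
# 'logreg'/'logisticregression', 'svc'/'svc', 'random_forest'/'randomforestclassifier'.
models = {
    "logreg": Pipeline([("scaler", StandardScaler()),
                        ("logisticregression", LogisticRegression(max_iter=1000))]),
    "svc": Pipeline([("scaler", StandardScaler()),
                     ("svc", SVC())]),
}

# Small search budget for illustration; the defaults are n_iter=100, cv=5
best_models, cv_scores = ClassifierOptimizer(models, X_train, y_train, scoring="f1", n_iter=10, cv=3)
print(cv_scores["logreg"])  # mean/std of each train/test CV metric for the tuned logistic regression
```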