Feat: user-specified feature groups #5

Merged (2 commits, Mar 7, 2024)
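
This PR adds two constructor parameters to the sparse estimators: `feature_groups`, a sequence of mutually exclusive sets of feature indices, and `solver`, which selects the backend for the optimization problem. A minimal usage sketch (hypothetical data; the top-level import path is assumed from the test files):

import numpy as np
from sparsely import SparseLinearRegressor  # import path assumed

# Hypothetical data: feature 1 duplicates feature 0, and feature 3
# duplicates feature 2, so each pair is a natural exclusivity group.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))
X[:, 1] = X[:, 0]
X[:, 3] = X[:, 2]
y = X[:, 0] + X[:, 2] - X[:, 4] + 0.1 * rng.normal(size=100)

# At most one of features {0, 1} and at most one of {2, 3} may be
# selected; warm-start the search from features {0, 2, 4}.
model = SparseLinearRegressor(
    k=3,
    feature_groups=[{0, 1}, {2, 3}],
    start={0, 2, 4},
    solver="CBC",
)
model.fit(X, y)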
71 changes: 64 additions & 7 deletions src/sparsely/base.py
@@ -6,17 +6,20 @@

from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from numbers import Real, Integral
from typing import Optional, Callable, ClassVar
from typing import Optional, Callable, ClassVar, Sequence

import numpy as np
from halfspace import Model
from mip import OptimizationStatus
from sklearn.base import BaseEstimator
from sklearn.exceptions import FitFailedWarning
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.validation import check_is_fitted, check_scalar


class BaseSparseEstimator(BaseEstimator, ABC):
@@ -33,8 +36,14 @@ class BaseSparseEstimator(BaseEstimator, ABC):
normalize: Whether to normalize the data before fitting the model.
max_iters: The maximum number of iterations.
tol: The tolerance for the stopping criterion.
start: The initial guess for the selected features. If `None`, then the initial guess is randomly selected.
Providing a good initial guess based on problem-specific knowledge can significantly speed up the search.
start: The initial guess for the selected features. For example, if `start={0, 1, 2}`, then the first three
features will be selected. If `None`, then the initial guess is randomly selected. Providing a good initial
guess based on problem-specific knowledge can significantly speed up the search.
feature_groups: Sets of features that are mutually exclusive. For example, if `feature_groups=[{0, 1}, {2, 3}]`,
then at most one of features 0 and 1 will be selected, and at most one of features 2 and 3 will be selected.
This can be used to encode prior knowledge about the problem.
solver: The solver to use for the optimization problem. The available options are "CBC" and "GUROBI". Support
for the "HiGHS" solver is also planned for a future release.
random_state: Controls the random seed for the initial guess if a user-defined initial guess is not provided.
verbose: Whether to enable logging of the search progress.
"""
@@ -46,6 +55,8 @@ class BaseSparseEstimator(BaseEstimator, ABC):
"max_iters": [Interval(type=Integral, left=1, right=None, closed="left")],
"tol": [Interval(type=Real, left=0, right=None, closed="left")],
"start": ["array-like", None],
"feature_groups": ["array-like", None],
"solver": [StrOptions({"HiGHS", "CBC", "GUROBI"})],
"random_state": ["random_state"],
"verbose": ["boolean"],
}
@@ -58,6 +69,8 @@ def __init__(
max_iters: int = 500,
tol: float = 1e-4,
start: Optional[set[int]] = None,
feature_groups: Optional[Sequence[set[int]]] = None,
solver: str = "CBC",
random_state: Optional[int] = None,
verbose: bool = False,
):
@@ -70,6 +83,8 @@ def __init__(
max_iters: The value for the `max_iters` attribute.
tol: The value for the `tol` attribute.
start: The value for the `start` attribute.
feature_groups: The value for the `feature_groups` attribute.
solver: The value for the `solver` attribute.
random_state: The value for the `random_state` attribute.
verbose: The value for the `verbose` attribute.
"""
@@ -79,6 +94,8 @@ def __init__(
self.max_iters = max_iters
self.tol = tol
self.start = start
self.feature_groups = feature_groups
self.solver = solver
self.random_state = random_state
self.verbose = verbose

@@ -113,8 +130,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> BaseSparseEstimator:
else:
start = self.start

# Optimize feature selection
# Build the feature selection optimization model
model = Model(
solver_name=self.solver,
max_gap=self.tol,
max_gap_abs=self.tol,
log_freq=1 if self.verbose else None,
@@ -126,8 +144,19 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> BaseSparseEstimator:
model.add_objective_term(var=selected, func=func, grad=True)
model.add_linear_constr(sum(selected) <= self._k)
model.add_linear_constr(sum(selected) >= 1)
if self.feature_groups:
for group in self.feature_groups:
model.add_linear_constr(sum(selected[i] for i in group) <= 1)
model.start = [(selected[i], 1) for i in start]
model.optimize()

# Solve the model and extract the selected features
status = model.optimize()
if status not in (OptimizationStatus.OPTIMAL, OptimizationStatus.FEASIBLE):
warnings.warn(
f"Optimization failed with status: {status}.",
category=FitFailedWarning,
)
return self
selected = np.round([model.var_value(var) for var in selected]).astype(bool)

# Compute coefficients
@@ -163,6 +192,34 @@ def intercept(self) -> float:
check_is_fitted(estimator=self)
return self._get_intercept()

def _validate_params(self):
super()._validate_params()
if self.start is not None:
for i in self.start:
check_scalar(
x=i,
name="start",
target_type=int,
min_val=0,
max_val=self.n_features_in_ - 1,
include_boundaries="both",
)
if self.feature_groups is not None:
for group in self.feature_groups:
if not isinstance(group, set):
raise TypeError(
f"Each feature group must be provided as a set, not type '{type(group)}'."
)
for i in group:
check_scalar(
x=i,
name="feature_groups",
target_type=int,
min_val=0,
max_val=self.n_features_in_ - 1,
include_boundaries="both",
)

@abstractmethod
def _pre_process_y(self, y: np.ndarray) -> np.ndarray:
pass
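
Inside `fit`, each group becomes a single linear constraint over the binary selection variables. The encoding can be reproduced standalone with python-mip (already a dependency, as the `OptimizationStatus` import shows); the toy objective below merely stands in for the cutting-plane approximation of the training loss that the halfspace `Model` builds up iteratively:

from mip import BINARY, Model, OptimizationStatus, maximize, xsum

n_features, k = 6, 3
feature_groups = [{0, 1}, {2, 3}]

m = Model()
selected = [m.add_var(var_type=BINARY) for _ in range(n_features)]

# Cardinality bounds, as in fit(): at least 1 and at most k features.
m += xsum(selected) <= k
m += xsum(selected) >= 1

# Mutual exclusivity, as in fit(): at most one feature per group.
for group in feature_groups:
    m += xsum(selected[i] for i in group) <= 1

# Toy linear objective in place of the cutting-plane surrogate.
m.objective = maximize(xsum((i + 1) * v for i, v in enumerate(selected)))

status = m.optimize()
if status in (OptimizationStatus.OPTIMAL, OptimizationStatus.FEASIBLE):
    support = [i for i, v in enumerate(selected) if v.x >= 0.5]
    print(support)  # [3, 4, 5]: only one of {2, 3} enters the support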
13 changes: 10 additions & 3 deletions tests/test_classifier.py
@@ -17,6 +17,8 @@
SparseLinearClassifier(normalize=False),
SparseLinearClassifier(k=3),
SparseLinearClassifier(gamma=1e-1),
SparseLinearClassifier(feature_groups=[{0, 1}, {2, 3}]),
SparseLinearClassifier(start={0, 1, 2}),
],
)
def test_sparse_linear_regressor(
@@ -29,9 +31,10 @@ def test_sparse_linear_regressor(
assert estimator._coef.shape == (X_train.shape[1],)
assert predicted.shape == (X_test.shape[0],)
assert predicted_proba.shape == (X_test.shape[0], 2)
assert balanced_accuracy_score(y_test, predicted) > 0.9
assert roc_auc_score(y_test, predicted_proba[:, 1]) > 0.9
assert estimator._coef.shape == (X_train.shape[1],)
if estimator.feature_groups is None:
assert balanced_accuracy_score(y_test, predicted) > 0.9
assert roc_auc_score(y_test, predicted_proba[:, 1]) > 0.9


@pytest.mark.parametrize(
@@ -40,11 +43,15 @@
SparseLinearClassifier(k=0),
SparseLinearClassifier(k=11),
SparseLinearClassifier(gamma=-1e-2),
SparseLinearClassifier(start={0, 1, 1000}),
SparseLinearClassifier(feature_groups=[{-1, 0, 1}]),
SparseLinearClassifier(feature_groups=[{0, 1, 1000}]),
SparseLinearClassifier(feature_groups=[[0, 0, 1]]),
],
)
def test_sparse_linear_regressor_invalid_params(
classification_dataset: Dataset, estimator: SparseLinearClassifier
):
X_train, X_test, y_train, y_test = classification_dataset
with pytest.raises(ValueError):
with pytest.raises((ValueError, TypeError)):
estimator.fit(X_train, y_train)
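
The list-valued group in the last case hits the new `TypeError` branch rather than `check_scalar`, so the parametrized test accepts either exception type. The message can also be pinned down directly (a sketch; `X_train` and `y_train` come from the `classification_dataset` fixture, and the match string follows the error message in `_validate_params`):

import pytest

with pytest.raises(TypeError, match="must be provided as a set"):
    SparseLinearClassifier(feature_groups=[[0, 0, 1]]).fit(X_train, y_train)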
22 changes: 16 additions & 6 deletions tests/test_regressor.py
@@ -18,6 +18,8 @@ def test_sklearn_compatibility():
SparseLinearRegressor(normalize=False),
SparseLinearRegressor(k=3),
SparseLinearRegressor(gamma=1e-2),
SparseLinearRegressor(feature_groups=[{0, 1}, {2, 3}]),
SparseLinearRegressor(start={0, 1, 2}),
],
)
def test_sparse_linear_regressor(
@@ -27,11 +29,14 @@
predicted = estimator.fit(X_train, y_train).predict(X_test)
assert estimator._coef.shape == (X_train.shape[1],)
assert predicted.shape == (X_test.shape[0],)
assert estimator.score(X_train, y_train) > 0.95
assert estimator.score(X_test, y_test) > 0.95
assert estimator._coef.shape == (X_train.shape[1],)
assert (~np.isclose(coef, 0)).sum() <= estimator._k
assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()
if estimator.feature_groups is None:
assert estimator.score(X_train, y_train) > 0.95
assert estimator.score(X_test, y_test) > 0.95
assert estimator._coef.shape == (X_train.shape[1],)
assert (~np.isclose(coef, 0)).sum() <= estimator._k
assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()
else:
assert estimator._coef.shape == (X_train.shape[1],)


@pytest.mark.parametrize(
@@ -40,11 +45,16 @@ def test_sparse_linear_regressor(
SparseLinearRegressor(k=0),
SparseLinearRegressor(k=11),
SparseLinearRegressor(gamma=-1e-2),
SparseLinearRegressor(start={-1, 0, 1}),
SparseLinearRegressor(start={0, 1, 1000}),
SparseLinearRegressor(feature_groups=[{-1, 0, 1}]),
SparseLinearRegressor(feature_groups=[{0, 1, 1000}]),
SparseLinearRegressor(feature_groups=[[0, 0, 1]]),
],
)
def test_sparse_linear_regressor_invalid_params(
regression_dataset: Dataset, estimator: SparseLinearRegressor
):
X_train, X_test, y_train, y_test, coef = regression_dataset
with pytest.raises(ValueError):
with pytest.raises((ValueError, TypeError)):
estimator.fit(X_train, y_train)
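
When `feature_groups` is set, the assertions above deliberately stop at the coefficient shape, since the constrained support need not match the true coefficients. A stricter check could assert mutual exclusivity on the fitted support directly (a sketch, reusing the private `_coef` attribute already referenced in these tests):

support = set(np.flatnonzero(~np.isclose(estimator._coef, 0)))
for group in estimator.feature_groups:
    # No two features from the same group may be selected together.
    assert len(support & group) <= 1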