Feat: user-specified feature groups #5

Merged (2 commits, Mar 7, 2024)
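
This PR adds two constructor parameters to the sparse estimators: `feature_groups`, a sequence of mutually exclusive sets of feature indices, and `solver`, which selects the backend for the optimization problem. A minimal usage sketch (hypothetical data; the top-level import path is assumed from the test files):

import numpy as np
from sparsely import SparseLinearRegressor  # import path assumed

# Hypothetical data: feature 1 duplicates feature 0, and feature 3
# duplicates feature 2, so each pair is a natural exclusivity group.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))
X[:, 1] = X[:, 0]
X[:, 3] = X[:, 2]
y = X[:, 0] + X[:, 2] - X[:, 4] + 0.1 * rng.normal(size=100)

# At most one of features {0, 1} and at most one of {2, 3} may be
# selected; warm-start the search from features {0, 2, 4}.
model = SparseLinearRegressor(
    k=3,
    feature_groups=[{0, 1}, {2, 3}],
    start={0, 2, 4},
    solver="CBC",
)
model.fit(X, y)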
71 changes: 64 additions & 7 deletions src/sparsely/base.py
@@ -6,17 +6,20 @@

from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from numbers import Real, Integral
from typing import Optional, Callable, ClassVar
from typing import Optional, Callable, ClassVar, Sequence

import numpy as np
from halfspace import Model
from mip import OptimizationStatus
from sklearn.base import BaseEstimator
from sklearn.exceptions import FitFailedWarning
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.validation import check_is_fitted, check_scalar


class BaseSparseEstimator(BaseEstimator, ABC):
@@ -33,8 +36,14 @@ class BaseSparseEstimator(BaseEstimator, ABC):
normalize: Whether to normalize the data before fitting the model.
max_iters: The maximum number of iterations.
tol: The tolerance for the stopping criterion.
start: The initial guess for the selected features. If `None`, then the initial guess is randomly selected.
Providing a good initial guess based on problem-specific knowledge can significantly speed up the search.
start: The initial guess for the selected features. For example, if `start={0, 1, 2}`, then the first three
features will be selected. If `None`, then the initial guess is randomly selected. Providing a good initial
guess based on problem-specific knowledge can significantly speed up the search.
feature_groups: Sets of features that are mutually exclusive. For example, if `feature_groups=[{0, 1}, {2, 3}]`,
then at most one of features 0 and 1 will be selected, and at most one of features 2 and 3 will be selected.
This can be used to encode prior knowledge about the problem.
solver: The solver to use for the optimization problem. The available options are "CBC" and "GUROBI". Support
for the "HiGHS" solver is also planned for a future release.
random_state: Controls the random seed for the initial guess if a user-defined initial guess is not provided.
verbose: Whether to enable logging of the search progress.
"""
@@ -46,6 +55,8 @@ class BaseSparseEstimator(BaseEstimator, ABC):
"max_iters": [Interval(type=Integral, left=1, right=None, closed="left")],
"tol": [Interval(type=Real, left=0, right=None, closed="left")],
"start": ["array-like", None],
"feature_groups": ["array-like", None],
"solver": [StrOptions({"HiGHS", "CBC", "GUROBI"})],
"random_state": ["random_state"],
"verbose": ["boolean"],
}
@@ -58,6 +69,8 @@ def __init__(
max_iters: int = 500,
tol: float = 1e-4,
start: Optional[set[int]] = None,
feature_groups: Optional[Sequence[set[int]]] = None,
solver: str = "CBC",
random_state: Optional[int] = None,
verbose: bool = False,
):
@@ -70,6 +83,8 @@ def __init__(
max_iters: The value for the `max_iters` attribute.
tol: The value for the `tol` attribute.
start: The value for the `start` attribute.
feature_groups: The value for the `feature_groups` attribute.
solver: The value for the `solver` attribute.
random_state: The value for the `random_state` attribute.
verbose: The value for the `verbose` attribute.
"""
@@ -79,6 +94,8 @@ def __init__(
self.max_iters = max_iters
self.tol = tol
self.start = start
self.feature_groups = feature_groups
self.solver = solver
self.random_state = random_state
self.verbose = verbose

@@ -113,8 +130,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> BaseSparseEstimator:
else:
start = self.start

# Optimize feature selection
# Build the feature selection optimization model
model = Model(
solver_name=self.solver,
max_gap=self.tol,
max_gap_abs=self.tol,
log_freq=1 if self.verbose else None,
@@ -126,8 +144,19 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> BaseSparseEstimator:
model.add_objective_term(var=selected, func=func, grad=True)
model.add_linear_constr(sum(selected) <= self._k)
model.add_linear_constr(sum(selected) >= 1)
if self.feature_groups:
for group in self.feature_groups:
model.add_linear_constr(sum(selected[i] for i in group) <= 1)
model.start = [(selected[i], 1) for i in start]
model.optimize()

# Solve the model and extract the selected features
status = model.optimize()
if status not in (OptimizationStatus.OPTIMAL, OptimizationStatus.FEASIBLE):
warnings.warn(
f"Optimization failed with status: {status}.",
category=FitFailedWarning,
)
return self
selected = np.round([model.var_value(var) for var in selected]).astype(bool)

# Compute coefficients
@@ -163,6 +192,34 @@ def intercept(self) -> float:
check_is_fitted(estimator=self)
return self._get_intercept()

def _validate_params(self):
super()._validate_params()
if self.start is not None:
for i in self.start:
check_scalar(
x=i,
name="start",
target_type=int,
min_val=0,
max_val=self.n_features_in_ - 1,
include_boundaries="both",
)
if self.feature_groups is not None:
for group in self.feature_groups:
if not isinstance(group, set):
raise TypeError(
f"Each feature group must be provided as a set, not type '{type(group)}'."
)
for i in group:
check_scalar(
x=i,
name="feature_groups",
target_type=int,
min_val=0,
max_val=self.n_features_in_ - 1,
include_boundaries="both",
)

@abstractmethod
def _pre_process_y(self, y: np.ndarray) -> np.ndarray:
pass
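
Inside `fit`, each group becomes a single linear constraint over the binary selection variables. The encoding can be reproduced standalone with python-mip (already a dependency, as the `OptimizationStatus` import shows); the toy objective below merely stands in for the cutting-plane approximation of the training loss that the halfspace `Model` builds up iteratively:

from mip import BINARY, Model, OptimizationStatus, maximize, xsum

n_features, k = 6, 3
feature_groups = [{0, 1}, {2, 3}]

m = Model()
selected = [m.add_var(var_type=BINARY) for _ in range(n_features)]

# Cardinality bounds, as in fit(): at least 1 and at most k features.
m += xsum(selected) <= k
m += xsum(selected) >= 1

# Mutual exclusivity, as in fit(): at most one feature per group.
for group in feature_groups:
    m += xsum(selected[i] for i in group) <= 1

# Toy linear objective in place of the cutting-plane surrogate.
m.objective = maximize(xsum((i + 1) * v for i, v in enumerate(selected)))

status = m.optimize()
if status in (OptimizationStatus.OPTIMAL, OptimizationStatus.FEASIBLE):
    support = [i for i, v in enumerate(selected) if v.x >= 0.5]
    print(support)  # [3, 4, 5]: only one of {2, 3} enters the support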
13 changes: 10 additions & 3 deletions tests/test_classifier.py
@@ -17,6 +17,8 @@
SparseLinearClassifier(normalize=False),
SparseLinearClassifier(k=3),
SparseLinearClassifier(gamma=1e-1),
SparseLinearClassifier(feature_groups=[{0, 1}, {2, 3}]),
SparseLinearClassifier(start={0, 1, 2}),
],
)
def test_sparse_linear_regressor(
@@ -29,9 +31,10 @@ def test_sparse_linear_regressor(
assert estimator._coef.shape == (X_train.shape[1],)
assert predicted.shape == (X_test.shape[0],)
assert predicted_proba.shape == (X_test.shape[0], 2)
assert balanced_accuracy_score(y_test, predicted) > 0.9
assert roc_auc_score(y_test, predicted_proba[:, 1]) > 0.9
assert estimator._coef.shape == (X_train.shape[1],)
if estimator.feature_groups is None:
assert balanced_accuracy_score(y_test, predicted) > 0.9
assert roc_auc_score(y_test, predicted_proba[:, 1]) > 0.9


@pytest.mark.parametrize(
@@ -40,11 +43,15 @@
SparseLinearClassifier(k=0),
SparseLinearClassifier(k=11),
SparseLinearClassifier(gamma=-1e-2),
SparseLinearClassifier(start={0, 1, 1000}),
SparseLinearClassifier(feature_groups=[{-1, 0, 1}]),
SparseLinearClassifier(feature_groups=[{0, 1, 1000}]),
SparseLinearClassifier(feature_groups=[[0, 0, 1]]),
],
)
def test_sparse_linear_regressor_invalid_params(
classification_dataset: Dataset, estimator: SparseLinearClassifier
):
X_train, X_test, y_train, y_test = classification_dataset
with pytest.raises(ValueError):
with pytest.raises((ValueError, TypeError)):
estimator.fit(X_train, y_train)
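
The list-valued group in the last case hits the new `TypeError` branch rather than `check_scalar`, so the parametrized test accepts either exception type. The message can also be pinned down directly (a sketch; `X_train` and `y_train` come from the `classification_dataset` fixture, and the match string follows the error message in `_validate_params`):

import pytest

with pytest.raises(TypeError, match="must be provided as a set"):
    SparseLinearClassifier(feature_groups=[[0, 0, 1]]).fit(X_train, y_train)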
22 changes: 16 additions & 6 deletions tests/test_regressor.py
@@ -18,6 +18,8 @@ def test_sklearn_compatibility():
SparseLinearRegressor(normalize=False),
SparseLinearRegressor(k=3),
SparseLinearRegressor(gamma=1e-2),
SparseLinearRegressor(feature_groups=[{0, 1}, {2, 3}]),
SparseLinearRegressor(start={0, 1, 2}),
],
)
def test_sparse_linear_regressor(
@@ -27,11 +29,14 @@
predicted = estimator.fit(X_train, y_train).predict(X_test)
assert estimator._coef.shape == (X_train.shape[1],)
assert predicted.shape == (X_test.shape[0],)
assert estimator.score(X_train, y_train) > 0.95
assert estimator.score(X_test, y_test) > 0.95
assert estimator._coef.shape == (X_train.shape[1],)
assert (~np.isclose(coef, 0)).sum() <= estimator._k
assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()
if estimator.feature_groups is None:
assert estimator.score(X_train, y_train) > 0.95
assert estimator.score(X_test, y_test) > 0.95
assert estimator._coef.shape == (X_train.shape[1],)
assert (~np.isclose(coef, 0)).sum() <= estimator._k
assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()
else:
assert estimator._coef.shape == (X_train.shape[1],)


@pytest.mark.parametrize(
@@ -40,11 +45,16 @@ def test_sparse_linear_regressor(
SparseLinearRegressor(k=0),
SparseLinearRegressor(k=11),
SparseLinearRegressor(gamma=-1e-2),
SparseLinearRegressor(start={-1, 0, 1}),
SparseLinearRegressor(start={0, 1, 1000}),
SparseLinearRegressor(feature_groups=[{-1, 0, 1}]),
SparseLinearRegressor(feature_groups=[{0, 1, 1000}]),
SparseLinearRegressor(feature_groups=[[0, 0, 1]]),
],
)
def test_sparse_linear_regressor_invalid_params(
regression_dataset: Dataset, estimator: SparseLinearRegressor
):
X_train, X_test, y_train, y_test, coef = regression_dataset
with pytest.raises(ValueError):
with pytest.raises((ValueError, TypeError)):
estimator.fit(X_train, y_train)
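
When `feature_groups` is set, the assertions above deliberately stop at the coefficient shape, since the constrained support need not match the true coefficients. A stricter check could assert mutual exclusivity on the fitted support directly (a sketch, reusing the private `_coef` attribute already referenced in these tests):

support = set(np.flatnonzero(~np.isclose(estimator._coef, 0)))
for group in estimator.feature_groups:
    # No two features from the same group may be selected together.
    assert len(support & group) <= 1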