Skip to content

Commit

Permalink
[enhancement] add sklearnex version of validate_data, ```_check…
Browse files Browse the repository at this point in the history
…_sample_weight``` (#2177)

* add finiteness_checker pybind11 bindings

* added finiteness checker

* Update finiteness_checker.cpp

* Update finiteness_checker.cpp

* Update finiteness_checker.cpp

* Update finiteness_checker.cpp

* Update finiteness_checker.cpp

* Update finiteness_checker.cpp

* Rename finiteness_checker.cpp to finiteness_checker.cpp

* Update finiteness_checker.cpp

* add next step

* follow conventions

* make xtable explicit

* remove comment

* Update validation.py

* Update __init__.py

* Update validation.py

* Update __init__.py

* Update __init__.py

* Update validation.py

* Update _data_conversion.py

* Update _data_conversion.py

* Update policy_common.cpp

* Update policy_common.cpp

* Update _policy.py

* Update policy_common.cpp

* Rename finiteness_checker.cpp to finiteness_checker.cpp

* Create finiteness_checker.py

* Update validation.py

* Update __init__.py

* attempt at fixing circular imports again

* fix isort

* remove __init__ changes

* last move

* Update policy_common.cpp

* Update policy_common.cpp

* Update policy_common.cpp

* Update policy_common.cpp

* Update validation.py

* add testing

* isort

* attempt to fix module error

* add fptype

* fix typo

* Update validation.py

* remove sua_iface from to_table

* isort and black

* Update test_memory_usage.py

* format

* Update _data_conversion.py

* Update _data_conversion.py

* Update test_validation.py

* remove unnecessary code

* make reviewer changes

* make dtype check change

* add sparse testing

* try again

* try again

* try again

* temporary commit

* first attempt

* missing change?

* modify DummyEstimator for testing

* generalize DummyEstimator

* switch test

* further testing changes

* add initial validate_data test, will be refactored

* fixes for CI

* Update validation.py

* Update validation.py

* Update test_memory_usage.py

* Update base.py

* Update base.py

* improve tests

* fix logic

* fix logic

* fix logic again

* rename file

* Revert "rename file"

This reverts commit 8d47744.

* remove duplication

* fix imports

* Rename test_finite.py to test_validation.py

* Revert "Rename test_finite.py to test_validation.py"

This reverts commit ee799f6.

* updates

* Update validation.py

* fixes for some test failures

* fix text

* fixes for some failures

* make consistent

* fix bad logic

* fix in string

* attempt to see if dataframe conversion is causing the issue

* fix iter problem

* fix testing issues

* formatting

* revert change

* fixes for pandas

* there is a slowdown with pandas that needs to be solved

* swap to transpose for speed

* more clarity

* add _check_sample_weight

* add more testing

* rename

* remove unnecessary imports

* fix test slowness

* focus get_dataframes_and_queues

* put config_context around

* Update test_validation.py

* Update base.py

* Update test_validation.py

* generalize regex

* add fixes for sklearn 1.0 and input_name

* fixes for test failures

* Update validation.py

* Update test_validation.py

* Update validation.py

* formatting

* make suggested changes

* follow changes made in #2126

* fix future device problem

* Update validation.py

* minor changes based on #2206, suggestions

* remove xp as keyword

* only_non_negative -> ensure_non_negative

* add commentary

* formatting

* address changes

* Update test_validation.py

* Update base.py

* Update test_validation.py

* Update sklearnex/utils/validation.py

Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com>

---------

Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com>
  • Loading branch information
icfaust and ethanglaser authored Dec 10, 2024
1 parent 5d8d9bb commit 95bd1ea
Show file tree
Hide file tree
Showing 7 changed files with 487 additions and 130 deletions.
45 changes: 7 additions & 38 deletions sklearnex/tests/test_memory_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,14 @@
get_dataframes_and_queues,
)
from onedal.tests.utils._device_selection import get_queues, is_dpctl_device_available
from onedal.utils._array_api import _get_sycl_namespace
from onedal.utils._dpep_helpers import dpctl_available, dpnp_available
from sklearnex import config_context
from sklearnex.tests.utils import PATCHED_FUNCTIONS, PATCHED_MODELS, SPECIAL_INSTANCES
from sklearnex.tests.utils import (
PATCHED_FUNCTIONS,
PATCHED_MODELS,
SPECIAL_INSTANCES,
DummyEstimator,
)
from sklearnex.utils._array_api import get_namespace

if dpctl_available:
Expand Down Expand Up @@ -131,41 +135,6 @@ def gen_functions(functions):
ORDER_DICT = {"F": np.asfortranarray, "C": np.ascontiguousarray}


if _is_dpc_backend:

    from sklearn.utils.validation import check_is_fitted

    from onedal.datatypes import from_table, to_table

    class DummyEstimatorWithTableConversions(BaseEstimator):
        """Estimator stub that only round-trips its inputs through
        ``to_table`` / ``from_table``."""

        def fit(self, X, y=None):
            """Convert ``X`` and ``y`` to tables and keep the converted-back
            results as fitted attributes. Returns ``self``."""
            sua_iface, xp, _ = _get_sycl_namespace(X)
            data_table = to_table(X)
            target_table = to_table(y)
            # Both conversions back share the same keyword arguments; the
            # queue is always taken from X.
            back_conversion = {
                "sua_iface": sua_iface,
                "sycl_queue": X.sycl_queue,
                "xp": xp,
            }
            # Attributes with a trailing underscore are what the fitted
            # check in ``predict`` looks for. Their memory is released when
            # the estimator instance is deleted.
            self.x_attr_ = from_table(data_table, **back_conversion)
            self.y_attr_ = from_table(target_table, **back_conversion)
            return self

        def predict(self, X):
            """Return ``X`` after a table round trip; requires a prior ``fit``."""
            # Fails unless fitted attributes (trailing underscore) exist.
            check_is_fitted(self)
            sua_iface, xp, _ = _get_sycl_namespace(X)
            data_table = to_table(X)
            return from_table(
                data_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp
            )


def gen_clsf_data(n_samples, n_features, dtype=None):
data, label = make_classification(
n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777
Expand Down Expand Up @@ -369,7 +338,7 @@ def test_table_conversions_memory_leaks(dataframe, queue, order, data_shape, dty
pytest.skip("SYCL device memory leak check requires the level zero sysman")

_kfold_function_template(
DummyEstimatorWithTableConversions,
DummyEstimator,
dataframe,
data_shape,
queue,
Expand Down
2 changes: 2 additions & 0 deletions sklearnex/tests/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
SPECIAL_INSTANCES,
UNPATCHED_FUNCTIONS,
UNPATCHED_MODELS,
DummyEstimator,
_get_processor_info,
call_method,
gen_dataset,
Expand All @@ -39,6 +40,7 @@
"gen_models_info",
"gen_dataset",
"sklearn_clone_dict",
"DummyEstimator",
]

_IS_INTEL = "GenuineIntel" in _get_processor_info()
44 changes: 44 additions & 0 deletions sklearnex/tests/utils/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,11 @@
)
from sklearn.datasets import load_diabetes, load_iris
from sklearn.neighbors._base import KNeighborsMixin
from sklearn.utils.validation import check_is_fitted

from onedal.datatypes import from_table, to_table
from onedal.tests.utils._dataframes_support import _convert_to_dataframe
from onedal.utils._array_api import _get_sycl_namespace
from sklearnex import get_patch_map, patch_sklearn, sklearn_is_patched, unpatch_sklearn
from sklearnex.basic_statistics import BasicStatistics, IncrementalBasicStatistics
from sklearnex.linear_model import LogisticRegression
Expand Down Expand Up @@ -369,3 +372,44 @@ def _get_processor_info():
)

return proc


class DummyEstimator(BaseEstimator):
    """Test-only estimator that round-trips its inputs through
    ``to_table`` / ``from_table``.

    When ``_get_sycl_namespace`` reports a ``sua_iface`` for the input, the
    conversion back is done with that interface, the input's ``sycl_queue``
    and the detected namespace; otherwise ``from_table`` is called with its
    defaults.
    """

    def fit(self, X, y=None):
        """Convert ``X`` and ``y`` to tables and store the converted-back data.

        Parameters
        ----------
        X : array-like
            Input data; inspected with ``_get_sycl_namespace``.
        y : array-like or None, default=None
            Optional target data.

        Returns
        -------
        self : DummyEstimator
            The estimator with ``x_attr_`` and ``y_attr_`` set.
        """
        sua_iface, xp, _ = _get_sycl_namespace(X)
        X_table = to_table(X)
        y_table = to_table(y)
        # The presence of the fitted attributes (ending with a trailing
        # underscore) is required for the correct check. The cleanup of
        # the memory will occur at the estimator instance deletion.
        if sua_iface:
            self.x_attr_ = from_table(
                X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp
            )
            self.y_attr_ = from_table(
                y_table,
                sua_iface=sua_iface,
                # With no y there is no queue of its own to use, so fall
                # back to the queue of X.
                sycl_queue=X.sycl_queue if y is None else y.sycl_queue,
                xp=xp,
            )
        else:
            # Same attribute names as the SYCL branch: without the trailing
            # underscore ``check_is_fitted`` in ``predict`` would not see
            # them and would report the estimator as not fitted.
            self.x_attr_ = from_table(X_table)
            self.y_attr_ = from_table(y_table)

        return self

    def predict(self, X):
        """Return ``X`` after a ``to_table`` / ``from_table`` round trip.

        Parameters
        ----------
        X : array-like
            Input data; inspected with ``_get_sycl_namespace``.

        Returns
        -------
        returned_X
            The result of ``from_table`` applied to the table built from ``X``.

        Raises
        ------
        Whatever ``check_is_fitted`` raises when ``fit`` has not been called.
        """
        # Checks if the estimator is fitted by verifying the presence of
        # fitted attributes (ending with a trailing underscore).
        check_is_fitted(self)
        sua_iface, xp, _ = _get_sycl_namespace(X)
        X_table = to_table(X)
        if sua_iface:
            returned_X = from_table(
                X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp
            )
        else:
            returned_X = from_table(X_table)

        return returned_X
4 changes: 2 additions & 2 deletions sklearnex/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
# limitations under the License.
# ===============================================================================

from .validation import _assert_all_finite
from .validation import assert_all_finite

__all__ = ["_assert_all_finite"]
__all__ = ["assert_all_finite"]
89 changes: 0 additions & 89 deletions sklearnex/utils/tests/test_finite.py

This file was deleted.

Loading

0 comments on commit 95bd1ea

Please sign in to comment.