From fbe82372cfaff6dc98953090dc1de05689985d28 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Feb 2022 18:21:10 +0100 Subject: [PATCH 1/3] Add: Doc for `dataset_compression` --- autosklearn/estimators.py | 153 ++++++++++++++++---------------- doc/conf.py | 2 +- doc/faq.rst | 6 ++ doc/manual.rst | 48 ++++++++++++ 4 files changed, 121 insertions(+), 88 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 84fe97797c..a1b0775332 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -156,58 +156,39 @@ def __init__( 'feature_preprocessor': ["no_preprocessing"] } - resampling_strategy : Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit] = "holdout" + resampling_strategy : str | BaseCrossValidator | _RepeatedSplits | BaseShuffleSplit = "holdout" How to to handle overfitting, might need to use ``resampling_strategy_arguments`` if using ``"cv"`` based method or a Splitter object. + * **Options** + * ``"holdout"`` - Use a 67:33 (train:test) split + * ``"cv"`` - Perform cross validation, requires ``"folds"`` in ``resampling_strategy_arguments`` + * ``"holdout-iterative-fit"`` - Same as "holdout" but iterative fit where possible + * ``"cv-iterative-fit"`` - Same as "cv" but iterative fit where possible + * ``"partial-cv"`` - Same as "cv" but uses intensification. + * ``BaseCrossValidator`` - any BaseCrossValidator subclass (found in scikit-learn model_selection module) + * ``_RepeatedSplits`` - any _RepeatedSplits subclass (found in scikit-learn model_selection module) + * ``BaseShuffleSplit`` - any BaseShuffleSplit subclass (found in scikit-learn model_selection module) + If using a Splitter object that relies on the dataset retaining its current size and order, you will need to look at the ``dataset_compression`` argument and ensure that ``"subsample"`` is not included in the applied compression ``"methods"`` or disable it entirely with ``False``. 
- **Options** - - * ``"holdout"``: - 67:33 (train:test) split - * ``"holdout-iterative-fit"``: - 67:33 (train:test) split, iterative fit where possible - * ``"cv"``: - crossvalidation, - requires ``"folds"`` in ``resampling_strategy_arguments`` - * ``"cv-iterative-fit"``: - crossvalidation, - calls iterative fit where possible, - requires ``"folds"`` in ``resampling_strategy_arguments`` - * 'partial-cv': - crossvalidation with intensification, - requires ``"folds"`` in ``resampling_strategy_arguments`` - * ``BaseCrossValidator`` subclass: - any BaseCrossValidator subclass (found in scikit-learn model_selection module) - * ``_RepeatedSplits`` subclass: - any _RepeatedSplits subclass (found in scikit-learn model_selection module) - * ``BaseShuffleSplit`` subclass: - any BaseShuffleSplit subclass (found in scikit-learn model_selection module) - - resampling_strategy_arguments : dict, optional if 'holdout' (train_size default=0.67) - Additional arguments for resampling_strategy: - - * ``train_size`` should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. - * ``shuffle`` determines whether the data is shuffled prior to - splitting it into train and validation. - - Available arguments: - - * 'holdout': {'train_size': float} - * 'holdout-iterative-fit': {'train_size': float} - * 'cv': {'folds': int} - * 'cv-iterative-fit': {'folds': int} - * 'partial-cv': {'folds': int, 'shuffle': bool} - * BaseCrossValidator or _RepeatedSplits or BaseShuffleSplit object: all arguments - required by chosen class as specified in scikit-learn documentation. - If arguments are not provided, scikit-learn defaults are used. - If no defaults are available, an exception is raised. - Refer to the 'n_splits' argument as 'folds'. + resampling_strategy_arguments : Optional[Dict] + Additional arguments for ``resampling_strategy``, this is required if + using a ``cv`` based strategy: + + .. 
code-block:: python + + { + "train_size": 0.67, # The size of the training set + "shuffle": True, # Whether to shuffle before splitting data + "folds": 5 # Used in 'cv' based resampling strategies + } + + If using a custom splitter class, which takes ``n_splits`` such as + `PredefinedSplit `_, the value of ``"folds"`` will be used. tmp_folder : string, optional (None) folder to store configuration output and log files, if ``None`` @@ -219,12 +200,12 @@ def __init__( n_jobs : int, optional, experimental The number of jobs to run in parallel for ``fit()``. ``-1`` means - using all processors. - - **Important notes**: - - * By default, Auto-sklearn uses one core. - * Ensemble building is not affected by ``n_jobs`` but can be controlled by the number + using all processors. + + **Important notes**: + + * By default, Auto-sklearn uses one core. + * Ensemble building is not affected by ``n_jobs`` but can be controlled by the number of models in the ensemble. * ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models) * If ``dask_client`` is ``None``, a new dask client is created. @@ -288,16 +269,14 @@ def __init__( dataset_compression: Union[bool, Mapping[str, Any]] = True We compress datasets so that they fit into some predefined amount of memory. - Currently this does not apply to dataframes or sparse arrays, only to raw numpy arrays. + Currently this does not apply to dataframes or sparse arrays, only to raw + numpy arrays. - **NOTE** - - If using a custom ``resampling_strategy`` that relies on specific + **NOTE** - If using a custom ``resampling_strategy`` that relies on specific size or ordering of data, this must be disabled to preserve these properties. - You can disable this entirely by passing ``False``. - - Default configuration when left as ``True``: + You can disable this entirely by passing ``False`` or leave as the default + ``True`` for configuration below. .. 
code-block:: python @@ -311,36 +290,36 @@ def __init__( The available options are described here: - **memory_allocation** - - By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This - float value can be set with ``"memory_allocation": 0.1``. We also allow for - specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. - - The memory used by the dataset is checked after each reduction method is - performed. If the dataset fits into the allocated memory, any further methods - listed in ``"methods"`` will not be performed. - - For example, if ``methods: ["precision", "subsample"]`` and the - ``"precision"`` reduction step was enough to make the dataset fit into memory, - then the ``"subsample"`` reduction step will not be performed. - - **methods** - - We currently provide the following methods for reducing the dataset size. - These can be provided in a list and are performed in the order as given. - - * ``"precision"`` - We reduce floating point precision as follows: - * ``np.float128 -> np.float64`` - * ``np.float96 -> np.float64`` - * ``np.float64 -> np.float32`` - - * ``subsample`` - We subsample data such that it **fits directly into the - memory allocation** ``memory_allocation * memory_limit``. Therefore, this - should likely be the last method listed in ``"methods"``. - Subsampling takes into account classification labels and stratifies - accordingly. We guarantee that at least one occurrence of each label is - included in the sampled set. + * **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. + This float value can be set with ``"memory_allocation": 0.1``. + We also allow for specifying absolute memory in MB, e.g. 10MB is + ``"memory_allocation": 10``. + + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further + methods listed in ``"methods"`` will not be performed. 
+ + For example, if ``methods: ["precision", "subsample"]`` and the + ``"precision"`` reduction step was enough to make the dataset fit into + memory, then the ``"subsample"`` reduction step will not be performed. + + * **methods** + We provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order given. + + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + + * ``"subsample"`` - We subsample data such that it **fits directly into + the memory allocation** ``memory_allocation * memory_limit``. + Therefore, this should likely be the last method listed in + ``"methods"``. + Subsampling takes into account classification labels and stratifies + accordingly. We guarantee that at least one occurrence of each + label is included in the sampled set. Attributes ---------- diff --git a/doc/conf.py b/doc/conf.py index b1fe966178..5d114b3550 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -198,7 +198,7 @@ ('Start', 'index'), ('Releases', 'releases'), ('Installation', 'installation'), - #('Manual', 'manual'), + ('Manual', 'manual'), ('Examples', 'examples/index'), ('API', 'api'), ('Extending', 'extending'), diff --git a/doc/faq.rst b/doc/faq.rst index 439e5c9be3..000f8ece47 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -409,6 +409,12 @@ Configuring the Search Procedure Examples for using holdout and cross-validation can be found in :ref:`example ` + If using a custom resampling strategy with predefined splits, you may need to disable + the subsampling performed with particularly large datasets or if using a small ``memory_limit``. + Please see the manual section on :ref:`limits` and + :class:`AutoSklearnClassifier(dataset_compression=...) ` + for more details. + .. 
collapse:: Can I use a custom metric Examples for using a custom metric can be found in :ref:`example ` diff --git a/doc/manual.rst b/doc/manual.rst index 2a3df6528b..b1a4d9353a 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -45,6 +45,54 @@ tested. By default, *auto-sklearn* uses **one core**. See also :ref:`parallel` on how to configure this. + +.. collapse:: Managing data compression + + .. _manual_managing_data_compression: + + Auto-sklearn will attempt to fit the dataset into 1/10th of the ``memory_limit``. + This won't happen unless your dataset is quite large or you have a small + ``memory_limit``. This is done using two methods: reducing **precision** and + **subsampling**. One reason you may want to control this is if you require high + precision or you rely on predefined splits, which subsampling does not account + for. + + To turn off dataset compression: + + .. code:: python + + AutoSklearnClassifier( + dataset_compression = False + ) + + You can specify which of the methods are performed using: + + .. code:: python + + AutoSklearnClassifier( + dataset_compression = { "methods": ["precision", "subsample"] }, + ) + + You can change the memory allocation for the dataset to a percentage of ``memory_limit`` + or an absolute amount using: + + .. code:: python + + AutoSklearnClassifier( + dataset_compression = { "memory_allocation": 0.2 }, + ) + + The default arguments used when ``dataset_compression = True`` are: + + .. code:: python + + { + "memory_allocation": 0.1, + "methods": ["precision", "subsample"] + } + + The full description is given at :class:`AutoSklearnClassifier(dataset_compression=...) `. + .. 
_space: The search space From fbd4f0c99794b3c91e79248f672ca347c67667ce Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Feb 2022 18:33:07 +0100 Subject: [PATCH 2/3] Fix: Shorten line --- autosklearn/estimators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index a1b0775332..681822e322 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -188,7 +188,8 @@ def __init__( } If using a custom splitter class, which takes ``n_splits`` such as - `PredefinedSplit `_, the value of ``"folds"`` will be used. + `PredefinedSplit `_, + the value of ``"folds"`` will be used. tmp_folder : string, optional (None) folder to store configuration output and log files, if ``None`` From 03ac9090084fb78026370e8c06ca4fe4281fc436 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Feb 2022 18:40:53 +0100 Subject: [PATCH 3/3] Doc: Make more clear that the argument None still provides defaults --- autosklearn/estimators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 681822e322..070230ae94 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -175,9 +175,10 @@ def __init__( and ensure that ``"subsample"`` is not included in the applied compression ``"methods"`` or disable it entirely with ``False``. - resampling_strategy_arguments : Optional[Dict] + resampling_strategy_arguments : Optional[Dict] = None Additional arguments for ``resampling_strategy``, this is required if - using a ``cv`` based strategy: + using a ``cv`` based strategy. The default arguments if left as ``None`` + are: .. code-block:: python