Skip to content

Commit

Permalink
Iteratively build the forest to honor constraints (#439)
Browse files Browse the repository at this point in the history
* Iteratively build the forest to honor constraints

In particular depending on the dataset size either memory or time
constraints can become a problem which makes it unreliable as a
baseline. Gradually growing the forest sidesteps both issues.

* Make iterative fit default, parameterize execution

* Step_size as script parameter, safer check if done

When final_forest_size is not an exact multiple of step_size,
RandomForest should still terminate. Additionally, step_size is prefixed
with an underscore as it is not a RandomForestEstimator hyperparameter.
  • Loading branch information
PGijsbers authored Dec 31, 2021
1 parent 0e46e34 commit 30d9d02
Showing 1 changed file with 45 additions and 3 deletions.
48 changes: 45 additions & 3 deletions frameworks/RandomForest/exec.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import logging
import os
import tempfile as tmp
from typing import List

os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

import psutil
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

Expand All @@ -16,29 +18,69 @@
log = logging.getLogger(os.path.basename(__file__))


def extrapolate_with_worst_case(values: List[float], n: int = 5) -> float:
    """ Extrapolate the next value for `values`, based on the last `n` samples.

    The prediction is pessimistic: it assumes the largest step observed
    between consecutive samples in the window repeats once more.

    :param values: observed samples, oldest first. Must be non-empty.
    :param n: maximum number of trailing samples to consider (clipped to len(values)).
    :return: values[-1] plus the worst (largest) consecutive increase in the window;
             values[-1] unchanged when fewer than two samples are available.
    :raises ValueError: if `values` is empty.
    """
    if not values:
        raise ValueError("`values` must contain at least one sample.")
    n = min(len(values), n)
    window = values[-n:]
    if len(window) < 2:
        # A single sample carries no step information; assume no growth.
        # (The original `values[-n + 1:]` slice wrongly paired the last sample
        # with the *first* element of the full list in this case.)
        return values[-1]
    return values[-1] + max(v_next - v_prev for v_prev, v_next in zip(window, window[1:]))


def run(dataset, config):
log.info(f"\n**** Random Forest [sklearn v{sklearn.__version__}] ****\n")

is_classification = config.type == 'classification'
this_process = psutil.Process(os.getpid())

encode = config.framework_params.get('_encode', True)
X_train, X_test = dataset.train.X, dataset.test.X
y_train, y_test = dataset.train.y, dataset.test.y

training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
training_params = {
k: v for k, v in config.framework_params.items()
if not (k.startswith('_') or k == "n_estimators")
}
n_jobs = config.framework_params.get('_n_jobs', config.cores) # useful to disable multicore, regardless of the dataset config
step_size = config.framework_params.get('_step_size', 10)
final_forest_size = config.framework_params.get('n_estimators', 2000)

# Default margins are conservative, because robustness is paramount for a baseline.
time_margin = config.framework_params.get('_time_margin', 0.9)
memory_margin = config.framework_params.get('_memory_margin', 0.9)

log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
log.warning("We completely ignore the requirement to stay within the time limit.")
log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

estimator = RandomForestClassifier if is_classification else RandomForestRegressor
rf = estimator(n_jobs=n_jobs,
random_state=config.seed,
n_estimators=step_size,
warm_start=True,
**training_params)

with Timer() as training:
rf.fit(X_train, y_train)
training_times = [training.duration]
memory_usage = [this_process.memory_info()[0] / (2**20)]

while True:
rf.fit(X_train, y_train)

training_times.append(training.duration)
memory_usage.append(this_process.memory_info()[0] / (2**20))
log.info(f"Model trained {len(rf.estimators_):6d} trees in {int(training_times[-1]):6d} seconds using {int(memory_usage[-1]):6d}mb memory.")

will_run_out_of_memory = extrapolate_with_worst_case(memory_usage) >= config.max_mem_size_mb * memory_margin
will_run_out_of_time = extrapolate_with_worst_case(training_times) >= config.max_runtime_seconds * time_margin
if rf.n_estimators >= final_forest_size:
log.info("Stop training because desired forest size has been reached.")
break
elif will_run_out_of_time:
log.info("Stop training because it expects to exceed its time budget.")
break
elif will_run_out_of_memory:
log.info("Stop training because it expects to exceed its memory budget.")
break
else:
# https://stackoverflow.com/questions/42757892/how-to-use-warm-start/42763502
rf.n_estimators += step_size

with Timer() as predict:
predictions = rf.predict(X_test)
Expand Down

0 comments on commit 30d9d02

Please sign in to comment.