PUBDEV-7778: minimal integration of TargetEncoding into AutoML #4927

Merged: 25 commits (Sep 25, 2020)

Commits
1fcee55
integrate TE into AutoML (backend)
Sep 17, 2020
7a85fa6
integrate TE into AutoML (clients) + PUBDEV-7551
Sep 17, 2020
11fa33b
revert some minor changes
Sep 17, 2020
c79880a
fixed keep_original_columns parameter in integration
Sep 17, 2020
38b76ad
added proper backend tests + fixed some issues
Sep 17, 2020
c7f9568
improved encodings key names for better memory tracking
Sep 17, 2020
f325441
fixed R client + add R test
Sep 17, 2020
1309218
fixed some key leakage
Sep 18, 2020
a862506
fixed sklearn tests
Sep 18, 2020
15ddf3e
added internal cardinalityThreshold property
Sep 18, 2020
6eb3662
fix threshold filter to avoid removing response+foldcolumn
Sep 18, 2020
c9b2ff7
disable TE if no column to encode
Sep 20, 2020
dbf582b
disabling TE for GLM
Sep 21, 2020
6e0e27c
ensure that GLM uses same folding as other models although TE is not …
Sep 21, 2020
f042ebe
enabled TE for GLM if AutoML is used without CV
Sep 22, 2020
7a3336d
added warning regarding MOJO when using AutoML with targetencoding
Sep 22, 2020
9ec9116
activate feature using 'target_encoding' instead of 'targetencoding' …
Sep 22, 2020
ba347a5
set default cardinality threshold to 10
Sep 23, 2020
218efc6
adding filter to not encode features with too high cardinality relati…
Sep 23, 2020
90c31fa
PUBDEV-7778: Adding R documentation for ... in h2o.automl()
ledell Sep 24, 2020
106017b
Update h2o-automl/src/main/java/water/automl/api/schemas3/AutoMLBuild…
Sep 24, 2020
ed273e4
adding possibility to disable restrictions (mainly for testing for now)
Sep 24, 2020
7fbe441
cosmetics
Sep 24, 2020
67ebcfa
changing default blending values for TE in AutoML
Sep 25, 2020
9a53e46
disable TE for DNNs
Sep 25, 2020
2 changes: 2 additions & 0 deletions h2o-automl/build.gradle
@@ -5,10 +5,12 @@ dependencies {
compile project(":h2o-core")
compile project(":h2o-algos")
compileOnly project(":h2o-ext-xgboost")
compileOnly project(":h2o-ext-target-encoder")

// Test dependencies only
testCompile project(":h2o-test-support")
testCompile project(":h2o-ext-xgboost")
testCompile project(":h2o-ext-target-encoder")
testRuntimeOnly project(":${defaultWebserverModule}")
}

37 changes: 31 additions & 6 deletions h2o-automl/src/main/java/ai/h2o/automl/AutoML.java
@@ -9,6 +9,7 @@
import ai.h2o.automl.events.EventLogEntry.Stage;
import ai.h2o.automl.StepDefinition.Alias;
import ai.h2o.automl.leaderboard.*;
import ai.h2o.automl.preprocessing.PreprocessingStep;
import hex.Model;
import hex.ScoreKeeper.StoppingMetric;
import hex.splitframe.ShuffleSplitFrame;
@@ -183,6 +184,7 @@ public Class<AutoMLV99.AutoMLKeyV3> makeSchema() {
private long[] _originalTrainingFrameChecksums;
private transient NonBlockingHashMap<Key, String> _trackedKeys = new NonBlockingHashMap<>();
private transient ModelingStep[] _executionPlan;
private transient PreprocessingStep[] _preprocessing;

public AutoML() {
super(null);
@@ -206,6 +208,7 @@ public AutoML(Key<AutoML> key, Date startTime, AutoMLBuildSpec buildSpec) {

prepareData();
initLeaderboard();
initPreprocessing();
planWork();
_modelingStepsExecutor = new ModelingStepsExecutor(_leaderboard, _eventLog, _runCountdown);
} catch (Exception e) {
@@ -343,10 +346,22 @@ private void initLeaderboard() {
_leaderboard.setExtensionsProvider(createLeaderboardExtensionProvider(this));
}

private void initPreprocessing() {
_preprocessing = _buildSpec.build_models.preprocessing == null
? null
: Arrays.stream(_buildSpec.build_models.preprocessing)
.map(def -> def.newPreprocessingStep(this))
.toArray(PreprocessingStep[]::new);
}

PreprocessingStep[] getPreprocessing() {
return _preprocessing;
}

ModelingStep[] getExecutionPlan() {
return _executionPlan == null ? (_executionPlan = _modelingStepsRegistry.getOrderedSteps(_buildSpec.build_models.modeling_plan, this)) : _executionPlan;
}

void planWork() {
Set<IAlgo> skippedAlgos = new HashSet<>();
if (_buildSpec.build_models.exclude_algos != null) {
@@ -480,7 +495,7 @@ public boolean keepRunning() {
return !_runCountdown.timedOut() && remainingModels() > 0;
}

boolean isCVEnabled() {
public boolean isCVEnabled() {
return _buildSpec.build_control.nfolds > 0 || _buildSpec.input_spec.fold_column != null;
}

@@ -599,13 +614,19 @@ private void prepareData() {

private void learn() {
List<ModelingStep> executed = new ArrayList<>();
if (_preprocessing != null) {
for (PreprocessingStep preprocessingStep : _preprocessing) preprocessingStep.prepare();
}
for (ModelingStep step : getExecutionPlan()) {
if (!exceededSearchLimits(step)) {
if (_modelingStepsExecutor.submit(step, job())) {
executed.add(step);
}
}
}
if (_preprocessing != null) {
for (PreprocessingStep preprocessingStep : _preprocessing) preprocessingStep.dispose();
}
_actualModelingSteps = _modelingStepsRegistry.createDefinitionPlanFromSteps(executed.toArray(new ModelingStep[0]));
eventLog().info(Stage.Workflow, "Actual modeling steps: "+Arrays.toString(_actualModelingSteps));
}
@@ -622,13 +643,13 @@ private int nextInstanceCounter(String algoName, String type) {
return _instanceCounters.get(key).incrementAndGet();
}

Key makeKey(String algoName, String type, boolean with_counter) {
public Key makeKey(String algoName, String type, boolean with_counter) {
String counterStr = with_counter ? "_" + nextInstanceCounter(algoName, type) : "";
String prefix = StringUtils.isNullOrEmpty(type) ? algoName : algoName+"_"+type+"_";
return Key.make(prefix + counterStr + "_AutoML_" + timestampFormatForKeys.get().format(_startTime));
}

void trackKey(Key key) {
public void trackKey(Key key) {
_trackedKeys.put(key, Arrays.toString(Thread.currentThread().getStackTrace()));
}

@@ -653,7 +674,7 @@ private boolean exceededSearchLimits(ModelingStep step) {
//***************** Clean Up + other utility functions *****************//

/**
* Delete the AutoML-related objects, but leave the grids and models that it built.
* Delete the AutoML-related objects, including the grids and models that it built if cascade=true
*/
@Override
protected Futures remove_impl(Futures fs, boolean cascade) {
@@ -675,7 +696,11 @@ protected Futures remove_impl(Futures fs, boolean cascade) {
Frame.deleteTempFrameAndItsNonSharedVecs(_trainingFrame, _origTrainingFrame);
if (leaderboard() != null) leaderboard().remove(fs, cascade);
if (eventLog() != null) eventLog().remove(fs, cascade);

if (cascade && _preprocessing != null) {
for (PreprocessingStep preprocessingStep : _preprocessing) {
preprocessingStep.remove();
}
}
for (Key key : _trackedKeys.keySet()) Keyed.remove(key, fs, true);

return super.remove_impl(fs, cascade);
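
The changes to AutoML.java above establish the full lifecycle of a preprocessing step: it is instantiated from the build spec in initPreprocessing(), prepare()d once before the modeling loop in learn(), dispose()d after the loop, and remove()d with the rest of the AutoML objects when cascade deletion is requested. A hypothetical no-op implementation is sketched below; the interface shape (prepare/apply/dispose/remove, with Completer assumed to be a functional interface exposing run()) is inferred from the call sites in AutoML.java above and ModelingStep.java further down, and the real PreprocessingStep interface defined elsewhere in this PR may differ.

import ai.h2o.automl.preprocessing.PreprocessingConfig;
import ai.h2o.automl.preprocessing.PreprocessingStep;
import hex.Model;

// Hypothetical no-op step illustrating the lifecycle driven by AutoML.java above
// and ModelingStep.java below. The interface shape is inferred from call sites
// in this diff and may not match the real PreprocessingStep interface.
public class LoggingPreprocessingStep implements PreprocessingStep {

    @Override
    public void prepare() {
        // called once in learn(), before any modeling step is submitted
        System.out.println("preparing shared preprocessing artifacts");
    }

    @Override
    public Completer apply(Model.Parameters params, PreprocessingConfig config) {
        // called per model/grid from ModelingStep.applyPreprocessing(); the returned
        // Completer is run later through ModelingStep.onDone()
        System.out.println("rewriting parameters for " + params.algoName());
        return () -> System.out.println("per-model cleanup"); // assumes Completer is a functional interface
    }

    @Override
    public void dispose() {
        // called once in learn(), after all modeling steps have been submitted
        System.out.println("releasing shared preprocessing artifacts");
    }

    @Override
    public void remove() {
        // called from remove_impl() when the AutoML instance is deleted with cascade=true
        System.out.println("deleting any keys owned by this step");
    }
}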
2 changes: 2 additions & 0 deletions h2o-automl/src/main/java/ai/h2o/automl/AutoMLBuildSpec.java
@@ -1,5 +1,6 @@
package ai.h2o.automl;

import ai.h2o.automl.preprocessing.PreprocessingStepDefinition;
import hex.Model;
import hex.ScoreKeeper.StoppingMetric;
import hex.grid.HyperSpaceSearchCriteria;
@@ -170,6 +171,7 @@ public static final class AutoMLBuildModels extends Iced {
public StepDefinition[] modeling_plan;
public double exploitation_ratio = 0;
public AutoMLCustomParameters algo_parameters = new AutoMLCustomParameters();
public PreprocessingStepDefinition[] preprocessing;
}

public static final class AutoMLCustomParameters extends Iced {
38 changes: 31 additions & 7 deletions h2o-automl/src/main/java/ai/h2o/automl/ModelingStep.java
@@ -9,6 +9,9 @@
import ai.h2o.automl.WorkAllocations.JobType;
import ai.h2o.automl.WorkAllocations.Work;
import ai.h2o.automl.leaderboard.Leaderboard;
import ai.h2o.automl.preprocessing.PreprocessingConfig;
import ai.h2o.automl.preprocessing.PreprocessingStep;
import ai.h2o.automl.preprocessing.PreprocessingStepDefinition;
import hex.Model;
import hex.Model.Parameters.FoldAssignmentScheme;
import hex.ModelBuilder;
@@ -27,9 +30,8 @@
import water.util.EnumUtils;
import water.util.Log;

import java.util.Arrays;
import java.util.Date;
import java.util.Map;
import java.util.*;
import java.util.function.Consumer;
import java.util.function.Predicate;

/**
@@ -46,6 +48,7 @@ protected enum SeedPolicy {
Incremental
}

static Predicate<Work> isDefaultModel = w -> w._type == JobType.ModelBuild;
static Predicate<Work> isExplorationWork = w -> w._type == JobType.ModelBuild || w._type == JobType.HyperparamSearch;
static Predicate<Work> isExploitationWork = w -> w._type == JobType.Selection;

@@ -55,6 +58,7 @@ protected <MP extends Model.Parameters> Job<Grid> startSearch(
final Map<String, Object[]> hyperParams,
final HyperSpaceSearchCriteria searchCriteria)
{
applyPreprocessing(baseParams);
aml().eventLog().info(Stage.ModelTraining, "AutoML: starting "+resultKey+" hyperparameter search")
.setNamedValue("start_"+_algo+"_"+_id, new Date(), EventLogEntry.epochFormat.get());
return GridSearch.startGridSearch(
@@ -72,6 +76,7 @@ protected <M extends Model, MP extends Model.Parameters> Job<M> startModel(
final MP params
) {
Job<M> job = new Job<>(resultKey, ModelBuilder.javaName(_algo.urlName()), _description);
applyPreprocessing(params);
ModelBuilder builder = ModelBuilder.make(_algo.urlName(), job, (Key<Model>) resultKey);
builder._parms = params;
aml().eventLog().info(Stage.ModelTraining, "AutoML: starting "+resultKey+" model training")
@@ -81,9 +86,9 @@ protected <M extends Model, MP extends Model.Parameters> Job<M> startModel(
return builder.trainModelOnH2ONode();
} catch (H2OIllegalArgumentException exception) {
aml().eventLog().warn(Stage.ModelTraining, "Skipping training of model "+resultKey+" due to exception: "+exception);
onDone(null);
return null;
}

}

private transient AutoML _aml;
@@ -93,6 +98,7 @@ protected <M extends Model, MP extends Model.Parameters> Job<M> startModel(
protected int _weight;
protected AutoML.Constraint[] _ignoredConstraints = new AutoML.Constraint[0]; // whether or not to ignore the max_models/max_runtime constraints
protected String _description;
private final transient List<Consumer<Job>> _onDone = new ArrayList<>();

StepDefinition _fromDef;

@@ -112,6 +118,13 @@ protected ModelingStep(IAlgo algo, String id, int weight, AutoML autoML) {

protected abstract Job startJob();

protected void onDone(Job job) {
for (Consumer<Job> exec : _onDone) {
exec.accept(job);
}
_onDone.clear();
};

protected AutoML aml() {
return _aml;
}
@@ -160,7 +173,7 @@ protected void setCommonModelBuilderParams(Model.Parameters params) {
setCrossValidationParams(params);
setWeightingParams(params);
setClassBalancingParams(params);

params._keep_cross_validation_models = buildSpec.build_control.keep_cross_validation_models;
params._keep_cross_validation_fold_assignment = buildSpec.build_control.nfolds != 0 && buildSpec.build_control.keep_cross_validation_fold_assignment;
params._export_checkpoints_dir = buildSpec.build_control.export_checkpoints_dir;
@@ -199,7 +212,18 @@ protected void setCustomParams(Model.Parameters params) {
if (customParams == null) return;
customParams.applyCustomParameters(_algo, params);
}


protected void applyPreprocessing(Model.Parameters params) {
if (aml().getPreprocessing() == null) return;
for (PreprocessingStep preprocessingStep : aml().getPreprocessing()) {
PreprocessingStep.Completer complete = preprocessingStep.apply(params, getPreprocessingConfig());
_onDone.add(j -> complete.run());
}
}

protected PreprocessingConfig getPreprocessingConfig() {
return new PreprocessingConfig();
}

/**
* Configures early-stopping for the model or set of models to be built.
@@ -340,6 +364,7 @@ protected Job<M> trainModel(Key<M> key, Model.Parameters parms) {
Work work = getAllocatedWork();
// double maxAssignedTimeSecs = aml().timeRemainingMs() / 1e3; // legacy
double maxAssignedTimeSecs = aml().timeRemainingMs() * getWorkAllocations().remainingWorkRatio(work) / 1e3; //including default models in the distribution of the time budget.
// double maxAssignedTimeSecs = aml().timeRemainingMs() * getWorkAllocations().remainingWorkRatio(work, isDefaultModel) / 1e3; //PUBDEV-7595
parms._max_runtime_secs = parms._max_runtime_secs == 0
? maxAssignedTimeSecs
: Math.min(parms._max_runtime_secs, maxAssignedTimeSecs);
@@ -350,7 +375,6 @@ protected Job<M> trainModel(Key<M> key, Model.Parameters parms) {
: "Time assigned for "+key+": "+parms._max_runtime_secs+"s");
return startModel(key, parms);
}

}

/**
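
One detail worth noting in trainModel() above: each model's time budget is the remaining AutoML runtime scaled by the share of work units still allocated to that step, and a user-supplied _max_runtime_secs can only tighten that budget, never extend it. A standalone illustration of the arithmetic, with all numbers invented for the example:

// Standalone illustration of the time-budget arithmetic in trainModel() above.
// All numbers are invented for the example.
public class TimeBudgetExample {
    public static void main(String[] args) {
        double timeRemainingMs = 600_000;   // 10 minutes left for the whole AutoML run
        double remainingWorkRatio = 0.25;   // this step owns 25% of the remaining work units
        double userMaxRuntimeSecs = 120;    // user-set _max_runtime_secs (0 would mean unset)

        double maxAssignedTimeSecs = timeRemainingMs * remainingWorkRatio / 1e3; // 150.0 s
        double effective = userMaxRuntimeSecs == 0
                ? maxAssignedTimeSecs
                : Math.min(userMaxRuntimeSecs, maxAssignedTimeSecs);             // 120.0 s
        System.out.println("time assigned to this model: " + effective + "s");
    }
}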
22 changes: 14 additions & 8 deletions h2o-automl/src/main/java/ai/h2o/automl/ModelingStepsExecutor.java
@@ -16,7 +16,6 @@
import water.util.Log;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
@@ -76,11 +75,18 @@ void stop() {
boolean submit(ModelingStep step, Job parentJob) {
if (step.canRun()) {
Job job = step.startJob();
if (job == null) {
skip(step._description, step.getAllocatedWork(), parentJob);
} else {
monitor(job, step.getAllocatedWork(), parentJob, ArrayUtils.contains(step._ignoredConstraints, Constraint.TIMEOUT));
return true;
try {
if (job==null) {
skip(step._description, step.getAllocatedWork(), parentJob);
} else {
monitor(job,
step.getAllocatedWork(),
parentJob,
ArrayUtils.contains(step._ignoredConstraints, Constraint.TIMEOUT));
return true;
}
} finally {
step.onDone(job);
}
}
return false;
@@ -145,13 +151,13 @@ void monitor(Job job, Work work, Job parentJob, boolean ignoreTimeout) {
} else if (job.get() == null) {
eventLog.info(Stage.ModelTraining, jobDescription + " cancelled");
} else {
eventLog.debug(Stage.ModelTraining, jobDescription + " complete");
ModelContainer<?> container = (ModelContainer) job.get();
int totalModelsBuilt = container.getModelCount();
if (totalModelsBuilt > lastTotalModelsBuilt) {
eventLog.debug(Stage.ModelTraining, "Built: "+totalModelsBuilt+" models for "+work._type+" : "+jobDescription);
this.addModels(container);
}
eventLog.debug(Stage.ModelTraining, jobDescription + " complete");
}
} else if (JobType.ModelBuild == work._type) {
if (job.isCrashed()) {
Expand All @@ -160,7 +166,7 @@ void monitor(Job job, Work work, Job parentJob, boolean ignoreTimeout) {
eventLog.info(Stage.ModelTraining, jobDescription + " cancelled");
} else {
eventLog.debug(Stage.ModelTraining, jobDescription + " complete");
this.addModel((Model) job.get());
this.addModel((Model)job.get());
}
}

h2o-automl/src/main/java/ai/h2o/automl/modeling/DeepLearningSteps.java
@@ -1,6 +1,8 @@
package ai.h2o.automl.modeling;

import ai.h2o.automl.*;
import ai.h2o.automl.preprocessing.PreprocessingConfig;
import ai.h2o.automl.preprocessing.TargetEncoding;
import hex.deeplearning.DeepLearningModel;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import hex.grid.Grid;
@@ -22,6 +24,14 @@ static abstract class DeepLearningModelStep extends ModelingStep.ModelStep<DeepL
public DeepLearningModelStep(String id, int weight, AutoML autoML) {
super(Algo.DeepLearning, id, weight, autoML);
}

@Override
protected PreprocessingConfig getPreprocessingConfig() {
//TE useless for DNN
PreprocessingConfig config = super.getPreprocessingConfig();
config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, aml().isCVEnabled());
return config;
}
}

static abstract class DeepLearningGridStep extends ModelingStep.GridStep<DeepLearningModel> {
Expand All @@ -39,7 +49,14 @@ DeepLearningParameters prepareModelParameters() {

return dlParameters;
}


@Override
protected PreprocessingConfig getPreprocessingConfig() {
//TE useless for DNN
PreprocessingConfig config = super.getPreprocessingConfig();
config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, aml().isCVEnabled());
return config;
}

Map<String, Object[]> prepareSearchParams() {
Map<String, Object[]> searchParams = new HashMap<>();
h2o-automl/src/main/java/ai/h2o/automl/modeling/GLMSteps.java
@@ -1,6 +1,9 @@
package ai.h2o.automl.modeling;

import ai.h2o.automl.*;
import ai.h2o.automl.preprocessing.PreprocessingConfig;
import ai.h2o.automl.preprocessing.PreprocessingStepDefinition;
import ai.h2o.automl.preprocessing.TargetEncoding;
import hex.Model;
import hex.glm.GLMModel;
import hex.glm.GLMModel.GLMParameters;
@@ -35,6 +38,15 @@ GLMParameters prepareModelParameters() {
: GLMParameters.Family.gaussian; // TODO: other continuous distributions!
return glmParameters;
}

@Override
protected PreprocessingConfig getPreprocessingConfig() {
//GLM (the exception as usual) doesn't support targetencoding if CV is enabled
// because it is initializing its lambdas + other params before CV (preventing changes in train frame during CV).
PreprocessingConfig config = super.getPreprocessingConfig();
config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, aml().isCVEnabled());
return config;
}
}


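
GLMSteps and DeepLearningSteps restrict target encoding through the same extension point, a per-step override of getPreprocessingConfig(): for GLM because, as the comment above explains, it fixes its lambdas and related parameters before cross-validation and therefore cannot tolerate a modified training frame during CV; for deep learning because the commits judge TE not to be useful there. Any other algorithm step could apply the same restriction. The sketch below is purely illustrative: the RestrictedTEStep class is invented, GLMModel and Algo.GLM are reused only so the sketch has concrete types, and CONFIG_PREPARE_CV_ONLY is the only configuration key visible in this diff.

import ai.h2o.automl.Algo;
import ai.h2o.automl.AutoML;
import ai.h2o.automl.ModelingStep;
import ai.h2o.automl.preprocessing.PreprocessingConfig;
import ai.h2o.automl.preprocessing.TargetEncoding;
import hex.glm.GLMModel;

// Hypothetical step, invented for illustration: any algorithm's ModelingStep can
// restrict target encoding through the same getPreprocessingConfig() override
// used by GLMSteps and DeepLearningSteps above.
abstract class RestrictedTEStep extends ModelingStep.ModelStep<GLMModel> {

    RestrictedTEStep(String id, int weight, AutoML autoML) {
        super(Algo.GLM, id, weight, autoML);
    }

    @Override
    protected PreprocessingConfig getPreprocessingConfig() {
        PreprocessingConfig config = super.getPreprocessingConfig();
        // Prepare TE artifacts only for the CV folds when CV is enabled.
        config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, aml().isCVEnabled());
        return config;
    }
}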