diff --git a/h2o-automl/build.gradle b/h2o-automl/build.gradle index 0ecb964d1054..96f25b7e23b0 100644 --- a/h2o-automl/build.gradle +++ b/h2o-automl/build.gradle @@ -5,10 +5,12 @@ dependencies { compile project(":h2o-core") compile project(":h2o-algos") compileOnly project(":h2o-ext-xgboost") + compileOnly project(":h2o-ext-target-encoder") // Test dependencies only testCompile project(":h2o-test-support") testCompile project(":h2o-ext-xgboost") + testCompile project(":h2o-ext-target-encoder") testRuntimeOnly project(":${defaultWebserverModule}") } diff --git a/h2o-automl/src/main/java/ai/h2o/automl/AutoML.java b/h2o-automl/src/main/java/ai/h2o/automl/AutoML.java index 5f732fa9cf5f..f5ac70a8f345 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/AutoML.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/AutoML.java @@ -9,6 +9,7 @@ import ai.h2o.automl.events.EventLogEntry.Stage; import ai.h2o.automl.StepDefinition.Alias; import ai.h2o.automl.leaderboard.*; +import ai.h2o.automl.preprocessing.PreprocessingStep; import hex.Model; import hex.ScoreKeeper.StoppingMetric; import hex.splitframe.ShuffleSplitFrame; @@ -183,6 +184,7 @@ public Class makeSchema() { private long[] _originalTrainingFrameChecksums; private transient NonBlockingHashMap _trackedKeys = new NonBlockingHashMap<>(); private transient ModelingStep[] _executionPlan; + private transient PreprocessingStep[] _preprocessing; public AutoML() { super(null); @@ -206,6 +208,7 @@ public AutoML(Key key, Date startTime, AutoMLBuildSpec buildSpec) { prepareData(); initLeaderboard(); + initPreprocessing(); planWork(); _modelingStepsExecutor = new ModelingStepsExecutor(_leaderboard, _eventLog, _runCountdown); } catch (Exception e) { @@ -343,10 +346,22 @@ private void initLeaderboard() { _leaderboard.setExtensionsProvider(createLeaderboardExtensionProvider(this)); } + private void initPreprocessing() { + _preprocessing = _buildSpec.build_models.preprocessing == null + ? null + : Arrays.stream(_buildSpec.build_models.preprocessing) + .map(def -> def.newPreprocessingStep(this)) + .toArray(PreprocessingStep[]::new); + } + + PreprocessingStep[] getPreprocessing() { + return _preprocessing; + } + ModelingStep[] getExecutionPlan() { return _executionPlan == null ? 
(_executionPlan = _modelingStepsRegistry.getOrderedSteps(_buildSpec.build_models.modeling_plan, this)) : _executionPlan; } - + void planWork() { Set skippedAlgos = new HashSet<>(); if (_buildSpec.build_models.exclude_algos != null) { @@ -480,7 +495,7 @@ public boolean keepRunning() { return !_runCountdown.timedOut() && remainingModels() > 0; } - boolean isCVEnabled() { + public boolean isCVEnabled() { return _buildSpec.build_control.nfolds > 0 || _buildSpec.input_spec.fold_column != null; } @@ -599,6 +614,9 @@ private void prepareData() { private void learn() { List executed = new ArrayList<>(); + if (_preprocessing != null) { + for (PreprocessingStep preprocessingStep : _preprocessing) preprocessingStep.prepare(); + } for (ModelingStep step : getExecutionPlan()) { if (!exceededSearchLimits(step)) { if (_modelingStepsExecutor.submit(step, job())) { @@ -606,6 +624,9 @@ private void learn() { } } } + if (_preprocessing != null) { + for (PreprocessingStep preprocessingStep : _preprocessing) preprocessingStep.dispose(); + } _actualModelingSteps = _modelingStepsRegistry.createDefinitionPlanFromSteps(executed.toArray(new ModelingStep[0])); eventLog().info(Stage.Workflow, "Actual modeling steps: "+Arrays.toString(_actualModelingSteps)); } @@ -622,13 +643,13 @@ private int nextInstanceCounter(String algoName, String type) { return _instanceCounters.get(key).incrementAndGet(); } - Key makeKey(String algoName, String type, boolean with_counter) { + public Key makeKey(String algoName, String type, boolean with_counter) { String counterStr = with_counter ? "_" + nextInstanceCounter(algoName, type) : ""; String prefix = StringUtils.isNullOrEmpty(type) ? algoName : algoName+"_"+type+"_"; return Key.make(prefix + counterStr + "_AutoML_" + timestampFormatForKeys.get().format(_startTime)); } - void trackKey(Key key) { + public void trackKey(Key key) { _trackedKeys.put(key, Arrays.toString(Thread.currentThread().getStackTrace())); } @@ -653,7 +674,7 @@ private boolean exceededSearchLimits(ModelingStep step) { //***************** Clean Up + other utility functions *****************// /** - * Delete the AutoML-related objects, but leave the grids and models that it built. 
+ * Delete the AutoML-related objects, including the grids and models that it built if cascade=true */ @Override protected Futures remove_impl(Futures fs, boolean cascade) { @@ -675,7 +696,11 @@ protected Futures remove_impl(Futures fs, boolean cascade) { Frame.deleteTempFrameAndItsNonSharedVecs(_trainingFrame, _origTrainingFrame); if (leaderboard() != null) leaderboard().remove(fs, cascade); if (eventLog() != null) eventLog().remove(fs, cascade); - + if (cascade && _preprocessing != null) { + for (PreprocessingStep preprocessingStep : _preprocessing) { + preprocessingStep.remove(); + } + } for (Key key : _trackedKeys.keySet()) Keyed.remove(key, fs, true); return super.remove_impl(fs, cascade); diff --git a/h2o-automl/src/main/java/ai/h2o/automl/AutoMLBuildSpec.java b/h2o-automl/src/main/java/ai/h2o/automl/AutoMLBuildSpec.java index 42312c3b2e9e..0a5c8fc3d217 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/AutoMLBuildSpec.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/AutoMLBuildSpec.java @@ -1,5 +1,6 @@ package ai.h2o.automl; +import ai.h2o.automl.preprocessing.PreprocessingStepDefinition; import hex.Model; import hex.ScoreKeeper.StoppingMetric; import hex.grid.HyperSpaceSearchCriteria; @@ -170,6 +171,7 @@ public static final class AutoMLBuildModels extends Iced { public StepDefinition[] modeling_plan; public double exploitation_ratio = 0; public AutoMLCustomParameters algo_parameters = new AutoMLCustomParameters(); + public PreprocessingStepDefinition[] preprocessing; } public static final class AutoMLCustomParameters extends Iced { diff --git a/h2o-automl/src/main/java/ai/h2o/automl/ModelingStep.java b/h2o-automl/src/main/java/ai/h2o/automl/ModelingStep.java index f00c9577a182..f29bcceef0fb 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/ModelingStep.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/ModelingStep.java @@ -9,6 +9,9 @@ import ai.h2o.automl.WorkAllocations.JobType; import ai.h2o.automl.WorkAllocations.Work; import ai.h2o.automl.leaderboard.Leaderboard; +import ai.h2o.automl.preprocessing.PreprocessingConfig; +import ai.h2o.automl.preprocessing.PreprocessingStep; +import ai.h2o.automl.preprocessing.PreprocessingStepDefinition; import hex.Model; import hex.Model.Parameters.FoldAssignmentScheme; import hex.ModelBuilder; @@ -27,9 +30,8 @@ import water.util.EnumUtils; import water.util.Log; -import java.util.Arrays; -import java.util.Date; -import java.util.Map; +import java.util.*; +import java.util.function.Consumer; import java.util.function.Predicate; /** @@ -46,6 +48,7 @@ protected enum SeedPolicy { Incremental } + static Predicate isDefaultModel = w -> w._type == JobType.ModelBuild; static Predicate isExplorationWork = w -> w._type == JobType.ModelBuild || w._type == JobType.HyperparamSearch; static Predicate isExploitationWork = w -> w._type == JobType.Selection; @@ -55,6 +58,7 @@ protected Job startSearch( final Map hyperParams, final HyperSpaceSearchCriteria searchCriteria) { + applyPreprocessing(baseParams); aml().eventLog().info(Stage.ModelTraining, "AutoML: starting "+resultKey+" hyperparameter search") .setNamedValue("start_"+_algo+"_"+_id, new Date(), EventLogEntry.epochFormat.get()); return GridSearch.startGridSearch( @@ -72,6 +76,7 @@ protected Job startModel( final MP params ) { Job job = new Job<>(resultKey, ModelBuilder.javaName(_algo.urlName()), _description); + applyPreprocessing(params); ModelBuilder builder = ModelBuilder.make(_algo.urlName(), job, (Key) resultKey); builder._parms = params; aml().eventLog().info(Stage.ModelTraining, "AutoML: 
starting "+resultKey+" model training") @@ -81,9 +86,9 @@ protected Job startModel( return builder.trainModelOnH2ONode(); } catch (H2OIllegalArgumentException exception) { aml().eventLog().warn(Stage.ModelTraining, "Skipping training of model "+resultKey+" due to exception: "+exception); + onDone(null); return null; } - } private transient AutoML _aml; @@ -93,6 +98,7 @@ protected Job startModel( protected int _weight; protected AutoML.Constraint[] _ignoredConstraints = new AutoML.Constraint[0]; // whether or not to ignore the max_models/max_runtime constraints protected String _description; + private final transient List> _onDone = new ArrayList<>(); StepDefinition _fromDef; @@ -112,6 +118,13 @@ protected ModelingStep(IAlgo algo, String id, int weight, AutoML autoML) { protected abstract Job startJob(); + protected void onDone(Job job) { + for (Consumer exec : _onDone) { + exec.accept(job); + } + _onDone.clear(); + }; + protected AutoML aml() { return _aml; } @@ -160,7 +173,7 @@ protected void setCommonModelBuilderParams(Model.Parameters params) { setCrossValidationParams(params); setWeightingParams(params); setClassBalancingParams(params); - + params._keep_cross_validation_models = buildSpec.build_control.keep_cross_validation_models; params._keep_cross_validation_fold_assignment = buildSpec.build_control.nfolds != 0 && buildSpec.build_control.keep_cross_validation_fold_assignment; params._export_checkpoints_dir = buildSpec.build_control.export_checkpoints_dir; @@ -199,7 +212,18 @@ protected void setCustomParams(Model.Parameters params) { if (customParams == null) return; customParams.applyCustomParameters(_algo, params); } - + + protected void applyPreprocessing(Model.Parameters params) { + if (aml().getPreprocessing() == null) return; + for (PreprocessingStep preprocessingStep : aml().getPreprocessing()) { + PreprocessingStep.Completer complete = preprocessingStep.apply(params, getPreprocessingConfig()); + _onDone.add(j -> complete.run()); + } + } + + protected PreprocessingConfig getPreprocessingConfig() { + return new PreprocessingConfig(); + } /** * Configures early-stopping for the model or set of models to be built. @@ -340,6 +364,7 @@ protected Job trainModel(Key key, Model.Parameters parms) { Work work = getAllocatedWork(); // double maxAssignedTimeSecs = aml().timeRemainingMs() / 1e3; // legacy double maxAssignedTimeSecs = aml().timeRemainingMs() * getWorkAllocations().remainingWorkRatio(work) / 1e3; //including default models in the distribution of the time budget. +// double maxAssignedTimeSecs = aml().timeRemainingMs() * getWorkAllocations().remainingWorkRatio(work, isDefaultModel) / 1e3; //PUBDEV-7595 parms._max_runtime_secs = parms._max_runtime_secs == 0 ? 
maxAssignedTimeSecs : Math.min(parms._max_runtime_secs, maxAssignedTimeSecs); @@ -350,7 +375,6 @@ protected Job trainModel(Key key, Model.Parameters parms) { : "Time assigned for "+key+": "+parms._max_runtime_secs+"s"); return startModel(key, parms); } - } } /** diff --git a/h2o-automl/src/main/java/ai/h2o/automl/ModelingStepsExecutor.java b/h2o-automl/src/main/java/ai/h2o/automl/ModelingStepsExecutor.java index 5b64a7318375..29bd3cc000f9 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/ModelingStepsExecutor.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/ModelingStepsExecutor.java @@ -16,7 +16,6 @@ import water.util.Log; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; @@ -76,11 +75,18 @@ void stop() { boolean submit(ModelingStep step, Job parentJob) { if (step.canRun()) { Job job = step.startJob(); - if (job == null) { - skip(step._description, step.getAllocatedWork(), parentJob); - } else { - monitor(job, step.getAllocatedWork(), parentJob, ArrayUtils.contains(step._ignoredConstraints, Constraint.TIMEOUT)); - return true; + try { + if (job == null) { + skip(step._description, step.getAllocatedWork(), parentJob); + } else { + monitor(job, + step.getAllocatedWork(), + parentJob, + ArrayUtils.contains(step._ignoredConstraints, Constraint.TIMEOUT)); + return true; + } + } finally { + step.onDone(job); } } return false; } @@ -145,13 +151,13 @@ void monitor(Job job, Work work, Job parentJob, boolean ignoreTimeout) { } else if (job.get() == null) { eventLog.info(Stage.ModelTraining, jobDescription + " cancelled"); } else { + eventLog.debug(Stage.ModelTraining, jobDescription + " complete"); ModelContainer container = (ModelContainer) job.get(); int totalModelsBuilt = container.getModelCount(); if (totalModelsBuilt > lastTotalModelsBuilt) { eventLog.debug(Stage.ModelTraining, "Built: "+totalModelsBuilt+" models for "+work._type+" : "+jobDescription); this.addModels(container); } - eventLog.debug(Stage.ModelTraining, jobDescription + " complete"); } } else if (JobType.ModelBuild == work._type) { if (job.isCrashed()) { @@ -160,7 +166,7 @@ void monitor(Job job, Work work, Job parentJob, boolean ignoreTimeout) { eventLog.info(Stage.ModelTraining, jobDescription + " cancelled"); } else { eventLog.debug(Stage.ModelTraining, jobDescription + " complete"); this.addModel((Model) job.get()); } } diff --git a/h2o-automl/src/main/java/ai/h2o/automl/modeling/DeepLearningStepsProvider.java b/h2o-automl/src/main/java/ai/h2o/automl/modeling/DeepLearningStepsProvider.java index 345f8ccebdad..9208698ee421 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/modeling/DeepLearningStepsProvider.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/modeling/DeepLearningStepsProvider.java @@ -1,6 +1,8 @@ package ai.h2o.automl.modeling; import ai.h2o.automl.*; +import ai.h2o.automl.preprocessing.PreprocessingConfig; +import ai.h2o.automl.preprocessing.TargetEncoding; import hex.deeplearning.DeepLearningModel; import hex.deeplearning.DeepLearningModel.DeepLearningParameters; import hex.grid.Grid; @@ -22,6 +24,14 @@ static abstract class DeepLearningModelStep extends ModelingStep.ModelStep { @@ -39,7 +49,14 @@ DeepLearningParameters prepareModelParameters() { return dlParameters; } - + + @Override + protected PreprocessingConfig getPreprocessingConfig() { + //TE useless for DNN + PreprocessingConfig config = super.getPreprocessingConfig(); + 
config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, aml().isCVEnabled()); + return config; + } Map prepareSearchParams() { Map searchParams = new HashMap<>(); diff --git a/h2o-automl/src/main/java/ai/h2o/automl/modeling/GLMStepsProvider.java b/h2o-automl/src/main/java/ai/h2o/automl/modeling/GLMStepsProvider.java index 592c54af7c18..93a625627876 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/modeling/GLMStepsProvider.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/modeling/GLMStepsProvider.java @@ -1,6 +1,9 @@ package ai.h2o.automl.modeling; import ai.h2o.automl.*; +import ai.h2o.automl.preprocessing.PreprocessingConfig; +import ai.h2o.automl.preprocessing.PreprocessingStepDefinition; +import ai.h2o.automl.preprocessing.TargetEncoding; import hex.Model; import hex.glm.GLMModel; import hex.glm.GLMModel.GLMParameters; @@ -35,6 +38,15 @@ GLMParameters prepareModelParameters() { : GLMParameters.Family.gaussian; // TODO: other continuous distributions! return glmParameters; } + + @Override + protected PreprocessingConfig getPreprocessingConfig() { + //GLM (the exception as usual) doesn't support target encoding when CV is enabled, + // because it initializes its lambdas and other params before CV (preventing changes to the train frame during CV). + PreprocessingConfig config = super.getPreprocessingConfig(); + config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, aml().isCVEnabled()); + return config; + } } diff --git a/h2o-automl/src/main/java/ai/h2o/automl/modeling/StackedEnsembleStepsProvider.java b/h2o-automl/src/main/java/ai/h2o/automl/modeling/StackedEnsembleStepsProvider.java index 2ea45117765a..9ddb038350e3 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/modeling/StackedEnsembleStepsProvider.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/modeling/StackedEnsembleStepsProvider.java @@ -3,12 +3,13 @@ import ai.h2o.automl.*; import ai.h2o.automl.WorkAllocations.Work; import ai.h2o.automl.events.EventLogEntry; +import ai.h2o.automl.preprocessing.PreprocessingConfig; +import ai.h2o.automl.preprocessing.PreprocessingStepDefinition; +import ai.h2o.automl.preprocessing.TargetEncoding; import hex.KeyValue; import hex.Model; -import hex.ensemble.Metalearner; import hex.ensemble.StackedEnsembleModel; import hex.ensemble.StackedEnsembleModel.StackedEnsembleParameters; -import hex.glm.GLMModel; import water.DKV; import water.Job; import water.Key; @@ -50,6 +51,14 @@ protected void setClassBalancingParams(Model.Parameters params) { //Disabled } + @Override + protected PreprocessingConfig getPreprocessingConfig() { + //SE should not have TE applied: the base models already do it. + PreprocessingConfig config = super.getPreprocessingConfig(); + config.put(TargetEncoding.CONFIG_ENABLED, false); + return config; + } + @Override protected boolean canRun() { Key[] keys = getBaseModels();
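
For orientation: the two flags set by these overrides are consumed by TargetEncoding.apply() further down in this patch. A minimal sketch of their effect (using only names introduced in this patch):

    PreprocessingConfig config = new PreprocessingConfig();
    // Skip target encoding for this model entirely: apply() returns a no-op Completer.
    config.put(TargetEncoding.CONFIG_ENABLED, false);
    // Keep the CV fold-column preparation, but don't attach the TE preprocessor to the model.
    config.put(TargetEncoding.CONFIG_PREPARE_CV_ONLY, true);
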
diff --git a/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingConfig.java b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingConfig.java new file mode 100644 index 000000000000..b571e6755bed --- /dev/null +++ b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingConfig.java @@ -0,0 +1,10 @@ +package ai.h2o.automl.preprocessing; + +import java.util.HashMap; + +public class PreprocessingConfig extends HashMap { + + boolean get(String key, boolean defaultValue) { + return (boolean) getOrDefault(key, defaultValue); + } +} diff --git a/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingStep.java b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingStep.java new file mode 100644 index 000000000000..e3a32a361c71 --- /dev/null +++ b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingStep.java @@ -0,0 +1,37 @@ +package ai.h2o.automl.preprocessing; + +import ai.h2o.automl.ModelingStep; +import hex.Model; + +public interface PreprocessingStep { + + interface Completer extends Runnable {} + + String getType(); + + /** + * Preprocessing steps are prepared by default before the AutoML session starts training the first model. + */ + void prepare(); + + /** + * Applies this preprocessing step to the model parameters right before the model training starts. + * @param params the model parameters to be modified by this step. + * @param config flags describing how the step should be applied. + * @return a function used to "complete" the preprocessing step: it is called by default at the end of the job creating model(s) from the given params, + * for example to clean up temporary artifacts that may have been created when applying the step. + */ + Completer apply(Model.Parameters params, PreprocessingConfig config); + + /** + * Preprocessing steps are disposed by default at the end of the AutoML training session. + * Note that disposing doesn't mean being removed from the system; + * the goal is mainly to release resources that are no longer needed for the current AutoML run. + */ + void dispose(); + + /** + * Completely removes this preprocessing step and its artifacts from the system. + */ + void remove(); + +}
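
To make the lifecycle documented above concrete, here is a minimal hypothetical implementation of the interface (illustration only, not part of the patch; the class name and log messages are invented):

    package ai.h2o.automl.preprocessing;

    import hex.Model;
    import water.util.Log;

    public class LoggingPreprocessingStep implements PreprocessingStep {
        @Override public String getType() { return "Logging"; }
        // Called once, before the AutoML session trains its first model.
        @Override public void prepare() { Log.info("building shared preprocessing artifacts"); }
        // Called for each model: mutate params here and return the matching undo action.
        @Override public Completer apply(Model.Parameters params, PreprocessingConfig config) {
            Log.info("applying preprocessing to " + params.algoName());
            return () -> Log.info("cleaning per-model artifacts");
        }
        // Called once at the end of the AutoML run: release run-scoped resources.
        @Override public void dispose() { Log.info("releasing run-scoped resources"); }
        // Called when the AutoML instance is removed with cascade=true.
        @Override public void remove() { Log.info("deleting remaining artifacts from the DKV"); }
    }
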
diff --git a/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingStepDefinition.java b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingStepDefinition.java new file mode 100644 index 000000000000..568599a16633 --- /dev/null +++ b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/PreprocessingStepDefinition.java @@ -0,0 +1,28 @@ +package ai.h2o.automl.preprocessing; + +import ai.h2o.automl.AutoML; +import water.Iced; + +public class PreprocessingStepDefinition extends Iced { + + public enum Type { + TargetEncoding + } + + Type _type; + + public PreprocessingStepDefinition() { /* for reflection */ } + + public PreprocessingStepDefinition(Type type) { + _type = type; + } + + public PreprocessingStep newPreprocessingStep(AutoML aml) { + switch (_type) { + case TargetEncoding: + return new TargetEncoding(aml); + default: + throw new IllegalStateException(); + } + } +}
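
From the Java side, enabling the only currently supported step looks like this (a sketch mirroring the TargetEncodingTest added at the end of this patch; it assumes a training Frame named train is already registered in the DKV):

    AutoMLBuildSpec spec = new AutoMLBuildSpec();
    spec.input_spec.training_frame = train._key;
    spec.input_spec.response_column = "survived";
    spec.build_models.preprocessing = new PreprocessingStepDefinition[] {
            new PreprocessingStepDefinition(PreprocessingStepDefinition.Type.TargetEncoding)
    };
    AutoML aml = AutoML.startAutoML(spec);
    aml.get(); // block until the run completes
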
diff --git a/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/TargetEncoding.java b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/TargetEncoding.java new file mode 100644 index 000000000000..b11215cc4d80 --- /dev/null +++ b/h2o-automl/src/main/java/ai/h2o/automl/preprocessing/TargetEncoding.java @@ -0,0 +1,234 @@ +package ai.h2o.automl.preprocessing; + +import ai.h2o.automl.AutoML; +import ai.h2o.automl.AutoMLBuildSpec.AutoMLBuildControl; +import ai.h2o.automl.AutoMLBuildSpec.AutoMLInput; +import ai.h2o.automl.events.EventLogEntry.Stage; +import ai.h2o.targetencoding.TargetEncoder; +import ai.h2o.targetencoding.TargetEncoderModel; +import ai.h2o.targetencoding.TargetEncoderModel.DataLeakageHandlingStrategy; +import ai.h2o.targetencoding.TargetEncoderModel.TargetEncoderParameters; +import ai.h2o.targetencoding.TargetEncoderPreprocessor; +import hex.Model; +import hex.Model.Parameters.FoldAssignmentScheme; +import hex.ModelPreprocessor; +import water.DKV; +import water.Key; +import water.fvec.Frame; +import water.fvec.Vec; +import water.rapids.ast.prims.advmath.AstKFold; +import water.util.ArrayUtils; + +import java.util.*; +import java.util.function.Predicate; + +public class TargetEncoding implements PreprocessingStep { + + public static String CONFIG_ENABLED = "target_encoding_enabled"; + public static String CONFIG_PREPARE_CV_ONLY = "target_encoding_prepare_cv_only"; + + static String TE_FOLD_COLUMN_SUFFIX = "_te_fold"; + private static final Completer NOOP = () -> {}; + + private AutoML _aml; + private TargetEncoderPreprocessor _tePreprocessor; + private TargetEncoderModel _teModel; + private final List<Completer> _disposables = new ArrayList<>(); + + private TargetEncoderParameters _defaultParams; + private boolean _encodeAllColumns = false; // if true, bypasses all restrictions on column selection. + private int _columnCardinalityThreshold = 10; // the minimum cardinality for a column to be target encoded. + + public TargetEncoding(AutoML aml) { + _aml = aml; + } + + @Override + public String getType() { + return PreprocessingStepDefinition.Type.TargetEncoding.name(); + } + + @Override + public void prepare() { + AutoMLInput amlInput = _aml.getBuildSpec().input_spec; + AutoMLBuildControl amlBuild = _aml.getBuildSpec().build_control; + Frame amlTrain = _aml.getTrainingFrame(); + + TargetEncoderParameters params = (TargetEncoderParameters) getDefaultParams().clone(); + params._train = amlTrain._key; + params._response_column = amlInput.response_column; + params._seed = amlBuild.stopping_criteria.seed(); + + Set teColumns = selectColumnsToEncode(amlTrain, params); + if (teColumns.isEmpty()) return; + + _aml.eventLog().warn(Stage.FeatureCreation, + "Target Encoding integration in AutoML is in an experimental stage; models obtained with this feature cannot yet be downloaded as MOJOs for production."); + + + if (_aml.isCVEnabled()) { + params._data_leakage_handling = DataLeakageHandlingStrategy.KFold; + params._fold_column = amlInput.fold_column; + if (params._fold_column == null) { + //generate fold column + Frame train = new Frame(params.train()); + Vec foldColumn = createFoldColumn( + params.train(), + FoldAssignmentScheme.Modulo, + amlBuild.nfolds, + params._response_column, + params._seed + ); + DKV.put(foldColumn); + params._fold_column = params._response_column+TE_FOLD_COLUMN_SUFFIX; + train.add(params._fold_column, foldColumn); + register(train, params._train.toString(), true); + params._train = train._key; + _disposables.add(() -> { + foldColumn.remove(); + DKV.remove(train._key); + }); + } + } + String[] keep = params.getNonPredictors(); + params._ignored_columns = Arrays.stream(amlTrain.names()) + .filter(col -> !teColumns.contains(col) && !ArrayUtils.contains(keep, col)) + .toArray(String[]::new); + + TargetEncoder te = new TargetEncoder(params, _aml.makeKey(getType(), null, false)); + _teModel = te.trainModel().get(); + _tePreprocessor = new TargetEncoderPreprocessor(_teModel); + } + + @Override + public Completer apply(Model.Parameters params, PreprocessingConfig config) { + if (_tePreprocessor == null || !config.get(CONFIG_ENABLED, true)) return NOOP; + + if (!config.get(CONFIG_PREPARE_CV_ONLY, false)) + params._preprocessors = (Key[])ArrayUtils.append(params._preprocessors, _tePreprocessor._key); + + Frame train = new Frame(params.train()); + String foldColumn = _teModel._parms._fold_column; + boolean addFoldColumn = foldColumn != null && train.find(foldColumn) < 0; + if (addFoldColumn) { + train.add(foldColumn, _teModel._parms._train.get().vec(foldColumn)); + register(train, params._train.toString(), true); + params._train = train._key; + params._fold_column = foldColumn; + params._nfolds = 0; // to avoid confusion or errors + params._fold_assignment = FoldAssignmentScheme.AUTO; // to avoid confusion or errors + } + + return () -> { + //revert train changes + if (addFoldColumn) { + DKV.remove(train._key); + } + }; + } + + @Override + public void dispose() { + for (Completer disposable : _disposables) disposable.run(); + } + + @Override + public void remove() { + if (_tePreprocessor != null) { + _tePreprocessor.remove(true); + _tePreprocessor = null; + _teModel = null; + } + } + + public void setDefaultParams(TargetEncoderParameters defaultParams) { + _defaultParams = defaultParams; + } + + public void setEncodeAllColumns(boolean encodeAllColumns) { + _encodeAllColumns = encodeAllColumns; + } + + public void setColumnCardinalityThreshold(int threshold) { 
_columnCardinalityThreshold = threshold; + } + + private TargetEncoderParameters getDefaultParams() { + if (_defaultParams != null) return _defaultParams; + + _defaultParams = new TargetEncoderParameters(); + _defaultParams._keep_original_categorical_columns = false; + _defaultParams._blending = true; + _defaultParams._inflection_point = 5; + _defaultParams._smoothing = 10; + _defaultParams._noise = 0; + + return _defaultParams; + } + + private Set selectColumnsToEncode(Frame fr, TargetEncoderParameters params) { + final Set encode = new HashSet<>(); + if (_encodeAllColumns) { + encode.addAll(Arrays.asList(fr.names())); + } else { + Predicate cardinalityLargeEnough = v -> v.cardinality() >= _columnCardinalityThreshold; + Predicate cardinalityNotTooLarge = params._blending + ? v -> (double) fr.numRows() / v.cardinality() > params._inflection_point + : v -> true; + + for (int i = 0; i < fr.names().length; i++) { + Vec v = fr.vec(i); + if (cardinalityLargeEnough.test(v) && cardinalityNotTooLarge.test(v)) + encode.add(fr.name(i)); + } + } + + AutoMLInput amlInput = _aml.getBuildSpec().input_spec; + List nonPredictors = Arrays.asList( + amlInput.weights_column, + amlInput.fold_column, + amlInput.response_column + ); + encode.removeAll(nonPredictors); + return encode; + } + + TargetEncoderPreprocessor getTEPreprocessor() { + return _tePreprocessor; + } + + TargetEncoderModel getTEModel() { + return _teModel; + } + + private static void register(Frame fr, String keyPrefix, boolean force) { + Key key = fr._key; + if (key == null || force) + fr._key = keyPrefix == null ? Key.make() : Key.make(keyPrefix+"_"+Key.rand()); + if (force) DKV.remove(key); + DKV.put(fr); + } + + public static Vec createFoldColumn(Frame fr, + FoldAssignmentScheme fold_assignment, + int nfolds, + String responseColumn, + long seed) { + Vec foldColumn; + switch (fold_assignment) { + default: + case AUTO: + case Random: + foldColumn = AstKFold.kfoldColumn(fr.anyVec().makeZero(), nfolds, seed); + break; + case Modulo: + foldColumn = AstKFold.moduloKfoldColumn(fr.anyVec().makeZero(), nfolds); + break; + case Stratified: + foldColumn = AstKFold.stratifiedKFoldColumn(fr.vec(responseColumn), nfolds, seed); + break; + } + return foldColumn; + } + +} diff --git a/h2o-automl/src/main/java/water/automl/api/schemas3/AutoMLBuildSpecV99.java b/h2o-automl/src/main/java/water/automl/api/schemas3/AutoMLBuildSpecV99.java index ef9bb99205ce..75439ecbaaa9 100644 --- a/h2o-automl/src/main/java/water/automl/api/schemas3/AutoMLBuildSpecV99.java +++ b/h2o-automl/src/main/java/water/automl/api/schemas3/AutoMLBuildSpecV99.java @@ -4,7 +4,6 @@ import ai.h2o.automl.Algo; import ai.h2o.automl.AutoMLBuildSpec; import ai.h2o.automl.AutoMLBuildSpec.AutoMLStoppingCriteria; -import ai.h2o.automl.IAlgo; import hex.KeyValue; import hex.ScoreKeeper.StoppingMetric; import water.Iced; @@ -255,6 +254,10 @@ public static final class AutoMLBuildModelsV99 extends SchemaV3 { + + public static final class TypeProvider extends EnumValuesProvider { + public TypeProvider() { + super(Type.class); + } + } + + @API(help="A type representing the preprocessing step to be executed.", valuesProvider=TypeProvider.class, direction=API.Direction.INOUT) + public Type type; + +} + diff --git a/h2o-automl/src/main/resources/META-INF/services/water.api.Schema b/h2o-automl/src/main/resources/META-INF/services/water.api.Schema index 689715e1fe07..bcb7caf21d8c 100644 --- a/h2o-automl/src/main/resources/META-INF/services/water.api.Schema +++ 
b/h2o-automl/src/main/resources/META-INF/services/water.api.Schema @@ -12,3 +12,4 @@ water.automl.api.schemas3.EventLogEntryV99 water.automl.api.schemas3.EventLogV99 water.automl.api.schemas3.StepDefinitionV99 water.automl.api.schemas3.StepDefinitionV99$StepV99 +water.automl.api.schemas3.PreprocessingStepDefinitionV99 diff --git a/h2o-automl/src/test/java/ai/h2o/automl/preprocessing/TargetEncodingTest.java b/h2o-automl/src/test/java/ai/h2o/automl/preprocessing/TargetEncodingTest.java new file mode 100644 index 000000000000..b324c8fef48b --- /dev/null +++ b/h2o-automl/src/test/java/ai/h2o/automl/preprocessing/TargetEncodingTest.java @@ -0,0 +1,243 @@ +package ai.h2o.automl.preprocessing; + +import ai.h2o.automl.*; +import ai.h2o.automl.dummy.DummyModel; +import ai.h2o.automl.preprocessing.PreprocessingStepDefinition.Type; +import ai.h2o.targetencoding.TargetEncoderModel; +import ai.h2o.targetencoding.TargetEncoderModel.DataLeakageHandlingStrategy; +import ai.h2o.targetencoding.TargetEncoderModel.TargetEncoderParameters; +import ai.h2o.targetencoding.TargetEncoderPreprocessor; +import hex.Model; +import hex.SplitFrame; +import hex.deeplearning.DeepLearningModel; +import hex.ensemble.StackedEnsembleModel; +import hex.glm.GLMModel; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import water.DKV; +import water.Key; +import water.Keyed; +import water.Scope; +import water.fvec.Frame; +import water.fvec.TestFrameBuilder; +import water.fvec.Vec; +import water.runner.CloudSize; +import water.runner.H2ORunner; +import water.util.ArrayUtils; + +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import static org.junit.Assert.*; +import static water.TestUtil.*; +import static water.TestUtil.ar; + +@CloudSize(1) +@RunWith(H2ORunner.class) +public class TargetEncodingTest { + + private List toDelete = new ArrayList<>(); + private AutoML aml; + private Frame fr; + + @Before + public void setup() { + fr = new TestFrameBuilder() + .withName("dummy_fr") + .withColNames("cat1", "numerical", "cat2", "target", "foldc") + .withVecTypes(Vec.T_CAT, Vec.T_NUM, Vec.T_CAT, Vec.T_CAT, Vec.T_NUM) + .withDataForCol(0, ar("a", "b", "c", "a", "b", "c")) + .withDataForCol(1, ard(1, 2, 5, 1.5, 3, 4)) + .withDataForCol(2, ar("s", null, "t", "t", null, "s")) + .withDataForCol(3, ar("yes", "no", "no", "yes", "yes", "no")) + .withDataForCol(4, ar(1, 1, 1, 2, 2, 2)) + .build(); + DKV.put(fr); toDelete.add(fr); + AutoMLBuildSpec buildSpec = new AutoMLBuildSpec(); + buildSpec.input_spec.training_frame = fr._key; + buildSpec.input_spec.response_column = "target"; + aml = new AutoML(null, new Date(), buildSpec); + DKV.put(aml); toDelete.add(aml); + } + + @After + public void cleanup() { + toDelete.forEach(Keyed::remove); + } + + @Test + public void test_default_params() { + aml.getBuildSpec().build_control.nfolds = 0; //disabling CV on AutoML + TargetEncoding te = new TargetEncoding(aml); + te.setEncodeAllColumns(true); + try { + Scope.enter(); + te.prepare(); + assertNotNull(te.getTEModel()); + assertNotNull(te.getTEPreprocessor()); + Scope.track_generic(te.getTEModel()); + Scope.track_generic(te.getTEPreprocessor()); + + TargetEncoderParameters teParams = te.getTEModel()._parms; + assertNull(teParams._fold_column); + assertEquals(DataLeakageHandlingStrategy.None, teParams._data_leakage_handling); + assertFalse(teParams._keep_original_categorical_columns); + assertTrue(teParams._blending); + assertEquals(0, teParams._noise, 0); + } finally { 
+ te.dispose(); + Scope.exit(); + } + } + + @Test + public void test_te_preprocessing_lifecycle_automl_no_cv() { + aml.getBuildSpec().build_control.nfolds = 0; //disabling CV on AutoML + TargetEncoding te = new TargetEncoding(aml); + te.setEncodeAllColumns(true); + assertNull(te.getTEModel()); + assertNull(te.getTEPreprocessor()); + try { + Scope.enter(); + te.prepare(); + assertNotNull(te.getTEModel()); + assertNotNull(te.getTEPreprocessor()); + Scope.track_generic(te.getTEModel()); + Scope.track_generic(te.getTEPreprocessor()); + assertNull(te.getTEModel()._parms._fold_column); + assertEquals(DataLeakageHandlingStrategy.None, te.getTEModel()._parms._data_leakage_handling); + + Model.Parameters params = new DummyModel.DummyModelParameters(); + params._train = fr._key; + params._nfolds = 0; + params._fold_column = null; + + PreprocessingStep.Completer complete = te.apply(params, new PreprocessingConfig()); + assertEquals(0, params._nfolds); + assertNull(params._fold_column); + complete.run(); + } finally { + te.dispose(); + Scope.exit(); + } + } + + + @Test + public void test_te_preprocessing_lifecycle_with_automl_cv_nfolds() { + int nfolds = 3; + aml.getBuildSpec().build_control.nfolds = nfolds; + TargetEncoding te = new TargetEncoding(aml); + te.setEncodeAllColumns(true); + try { + Scope.enter(); + te.prepare(); + assertNotNull(te.getTEModel()); + assertNotNull(te.getTEPreprocessor()); + Scope.track_generic(te.getTEModel()); + Scope.track_generic(te.getTEPreprocessor()); + assertNotNull(te.getTEModel()._parms._fold_column); + assertTrue(te.getTEModel()._parms._fold_column.endsWith(TargetEncoding.TE_FOLD_COLUMN_SUFFIX)); + assertEquals(DataLeakageHandlingStrategy.KFold, te.getTEModel()._parms._data_leakage_handling); + + Model.Parameters params = new DummyModel.DummyModelParameters(); + params._train = fr._key; + params._nfolds = nfolds; + params._fold_column = null; + + PreprocessingStep.Completer complete = te.apply(params, new PreprocessingConfig()); + assertEquals(0, params._nfolds); + assertNotNull(params._fold_column); + assertEquals(te.getTEModel()._parms._fold_column, params._fold_column); + assertNotEquals(fr._key, params._train); + Frame newTrain = params._train.get(); + assertTrue(ArrayUtils.contains(newTrain.names(), params._fold_column)); + assertFalse(ArrayUtils.contains(fr.names(), params._fold_column)); + assertEquals(nfolds, newTrain.vec(params._fold_column).toCategoricalVec().cardinality()); + complete.run(); + } finally { + te.dispose(); + Scope.exit(); + } + } + + @Test + public void test_te_preprocessing_lifecycle_with_automl_cv_foldcolumn() { + aml.getBuildSpec().input_spec.fold_column = "foldc"; + TargetEncoding te = new TargetEncoding(aml); + te.setEncodeAllColumns(true); + try { + Scope.enter(); + te.prepare(); + assertNotNull(te.getTEModel()); + assertNotNull(te.getTEPreprocessor()); + Scope.track_generic(te.getTEModel()); + Scope.track_generic(te.getTEPreprocessor()); + assertNotNull(te.getTEModel()._parms._fold_column); + assertEquals("foldc", te.getTEModel()._parms._fold_column); + assertEquals(DataLeakageHandlingStrategy.KFold, te.getTEModel()._parms._data_leakage_handling); + + Model.Parameters params = new DummyModel.DummyModelParameters(); + params._train = fr._key; + params._nfolds = 0; + params._fold_column = "foldc"; + + PreprocessingStep.Completer complete = te.apply(params, new PreprocessingConfig()); + assertEquals(0, params._nfolds); + assertNotNull(params._fold_column); + assertEquals("foldc", params._fold_column); + 
assertEquals(te.getTEModel()._parms._fold_column, params._fold_column); + assertEquals(fr._key, params._train); + complete.run(); + } finally { + te.dispose(); + Scope.exit(); + } + } + + + @Test + public void test_automl_run_with_target_encoding_enabled() { + try { + Scope.enter(); + AutoMLBuildSpec autoMLBuildSpec = new AutoMLBuildSpec(); + Frame fr = parse_test_file("./smalldata/titanic/titanic_expanded.csv"); Scope.track(fr); + SplitFrame sf = new SplitFrame(fr, new double[] { 0.7, 0.3 }, new Key[]{Key.make("titanic_train"), Key.make("titanic_test")}); + sf.exec().get(); + Frame train = sf._destination_frames[0].get(); Scope.track(train); + Frame test = sf._destination_frames[1].get(); Scope.track(test); + + autoMLBuildSpec.input_spec.training_frame = train._key; +// autoMLBuildSpec.input_spec.validation_frame = test._key; + autoMLBuildSpec.input_spec.leaderboard_frame = test._key; + autoMLBuildSpec.input_spec.response_column = "survived"; + autoMLBuildSpec.build_control.stopping_criteria.set_max_models(15); // sth big enough to test all algos+grids with TE + autoMLBuildSpec.build_control.stopping_criteria.set_seed(42); + autoMLBuildSpec.build_control.nfolds = 3; + autoMLBuildSpec.build_models.preprocessing = new PreprocessingStepDefinition[] { + new PreprocessingStepDefinition(Type.TargetEncoding) + }; + + aml = AutoML.startAutoML(autoMLBuildSpec); Scope.track_generic(aml); + aml.get(); + System.out.println(aml.leaderboard().toTwoDimTable()); + for (Model m : aml.leaderboard().getModels()) { + if (m instanceof StackedEnsembleModel + || m instanceof GLMModel + || m instanceof DeepLearningModel + ) { // disabled for GLM with CV, because GLM refuses to follow the same CV flow as other algos. + assertNull(m._parms._preprocessors); + } else { + assertNotNull(m._parms._preprocessors); + assertEquals(1, m._parms._preprocessors.length); + assertTrue(m._parms._preprocessors[0].get() instanceof TargetEncoderPreprocessor); + } + } + } finally { + Scope.exit(); + } + } +} diff --git a/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoder.java b/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoder.java index 81838ea78603..d5b00b0a606f 100644 --- a/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoder.java +++ b/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoder.java @@ -2,8 +2,11 @@ import ai.h2o.targetencoding.TargetEncoderModel.DataLeakageHandlingStrategy; import ai.h2o.targetencoding.TargetEncoderModel.TargetEncoderOutput; +import ai.h2o.targetencoding.TargetEncoderModel.TargetEncoderParameters; import hex.ModelBuilder; import hex.ModelCategory; +import water.DKV; +import water.Key; import water.Scope; import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.Frame; @@ -16,19 +19,24 @@ import static ai.h2o.targetencoding.TargetEncoderHelper.*; -public class TargetEncoder extends ModelBuilder { +public class TargetEncoder extends ModelBuilder { private static final Logger logger = LoggerFactory.getLogger(TargetEncoder.class); private TargetEncoderModel _targetEncoderModel; private String[] _columnsToEncode; - public TargetEncoder(TargetEncoderModel.TargetEncoderParameters parms) { + public TargetEncoder(TargetEncoderParameters parms) { super(parms); init(false); } + public TargetEncoder(TargetEncoderParameters parms, Key key) { + super(parms, key); + init(false); + } + public TargetEncoder(final boolean startupOnce) { - super(new 
TargetEncoderModel.TargetEncoderParameters(), startupOnce); + super(new TargetEncoderParameters(), startupOnce); } @Override @@ -137,7 +145,9 @@ private IcedHashMap prepareEncodingMap() { ); encodings.delete(); encodings = finalEncodings; - + if (encodings._key != null) DKV.remove(encodings._key); + encodings._key = Key.make(_result.toString()+"_encodings_"+columnToEncode); + DKV.put(encodings); columnToEncodings.put(columnToEncode, encodings); } diff --git a/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderModel.java b/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderModel.java index 99b452768cac..c7823524d572 100644 --- a/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderModel.java +++ b/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderModel.java @@ -184,6 +184,7 @@ public Frame transform(Frame fr, BlendingParams blendingParams, double noiseLeve * @return An instance of {@link Frame} with transformed fr, registered in DKV. */ public Frame transform(Frame fr, boolean asTraining, int outOfFold, BlendingParams blendingParams, double noiseLevel) { + if (!canApplyTargetEncoding(fr)) return fr; Frame adaptFr = null; try { adaptFr = adaptForEncoding(fr); @@ -211,6 +212,7 @@ protected double[] score0(double data[], double preds[]){ */ @Override public Frame score(Frame fr, String destination_key, Job j, boolean computeMetrics, CFuncRef customMetricFunc) throws IllegalArgumentException { + if (!canApplyTargetEncoding(fr)) return new Frame(fr); Frame adaptFr = null; try { adaptFr = adaptForEncoding(fr); @@ -245,7 +247,17 @@ private Frame adaptForEncoding(Frame fr) { return adaptFr; } - + private boolean canApplyTargetEncoding(Frame fr) { + String[] frColumns = fr.names(); + Set teColumns = _output._target_encoding_map.keySet(); + boolean canApply = Arrays.stream(frColumns).anyMatch(teColumns::contains); + if (!canApply) { + logger.info("Frame "+fr._key+" has no columns to encode with TargetEncoder, skipping it: " + + "columns="+Arrays.toString(fr.names())+", target encoder columns="+_output._target_encoding_map.keySet()); + } + return canApply; + } + /** * Core method for applying pre-calculated encodings to the dataset. * @@ -312,6 +324,7 @@ Frame applyTargetEncoding(Frame data, break; } + List tmps = new ArrayList<>(); Frame workingFrame = null; Key tmpKey; try { @@ -328,12 +341,18 @@ Frame applyTargetEncoding(Frame data, String columnToEncode = kv.getKey(); Frame encodings = kv.getValue(); + int colIdx = workingFrame.find(columnToEncode); + if (colIdx < 0) { + logger.warn("Column "+columnToEncode+" is missing in frame "+data._key); + continue; + } + // if not applying encodings to training data, then get rid of the foldColumn in encodings. if (dataLeakageHandlingStrategy != DataLeakageHandlingStrategy.KFold && encodings.find(foldColumn) >= 0) { encodings = groupEncodingsByCategory(encodings, encodings.find(columnToEncode)); + tmps.add(encodings); } - int colIdx = workingFrame.find(columnToEncode); imputeCategoricalColumn(workingFrame, colIdx, columnToEncode + NA_POSTFIX); IntStream posTargetClasses = _output.nclasses() == 1 ? 
IntStream.of(NO_TARGET_CLASS) // regression @@ -351,7 +370,7 @@ Frame applyTargetEncoding(Frame data, } // end for each target if (!_parms._keep_original_categorical_columns) - workingFrame.remove(colIdx); + tmps.add(workingFrame.remove(colIdx)); } // end for each columnToEncode DKV.remove(tmpKey); @@ -362,6 +381,8 @@ } catch (Exception e) { if (workingFrame != null) workingFrame.delete(); throw e; + } finally { + for (Keyed tmp : tmps) tmp.remove(); } } @@ -512,10 +533,10 @@ protected double valueForImputation(String columnToEncode, Frame encodings, } protected void removeNumeratorAndDenominatorColumns(Frame fr) { - Vec removedNumeratorNone = fr.remove(NUMERATOR_COL); - removedNumeratorNone.remove(); - Vec removedDenominatorNone = fr.remove(DENOMINATOR_COL); - removedDenominatorNone.remove(); + Vec removedNumerator = fr.remove(NUMERATOR_COL); + removedNumerator.remove(); + Vec removedDenominator = fr.remove(DENOMINATOR_COL); + removedDenominator.remove(); } } diff --git a/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderPreprocessor.java b/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderPreprocessor.java index efffb0c72fda..b40e925a8743 100644 --- a/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderPreprocessor.java +++ b/h2o-extensions/target-encoder/src/main/java/ai/h2o/targetencoding/TargetEncoderPreprocessor.java @@ -3,6 +3,7 @@ import hex.Model; import hex.ModelPreprocessor; import water.DKV; +import water.Futures; import water.Key; import water.fvec.Frame; @@ -48,6 +49,12 @@ public Model asModel() { return _targetEncoder; } + @Override + protected Futures remove_impl(Futures fs, boolean cascade) { + if (cascade && _targetEncoder != null) _targetEncoder.remove(); + return super.remove_impl(fs, cascade); + } + private boolean useFoldTransform(Model.Parameters params) { return params._is_cv_model && _targetEncoder._parms._data_leakage_handling == KFold; } diff --git a/h2o-extensions/target-encoder/src/test/java/ai/h2o/targetencoding/TargetEncoderPreprocessorTest.java b/h2o-extensions/target-encoder/src/test/java/ai/h2o/targetencoding/TargetEncoderPreprocessorTest.java index 1dcea5eadf57..1981d2450e80 100644 --- a/h2o-extensions/target-encoder/src/test/java/ai/h2o/targetencoding/TargetEncoderPreprocessorTest.java +++ b/h2o-extensions/target-encoder/src/test/java/ai/h2o/targetencoding/TargetEncoderPreprocessorTest.java @@ -152,7 +152,7 @@ private Frame makeValidFrame() { private TargetEncoderModel trainTE(Frame train, DataLeakageHandlingStrategy strategy, boolean encodeAll, boolean keepOriginalCategoricalPredictors) { TargetEncoderParameters params = new TargetEncoderParameters(); params._keep_original_categorical_columns = keepOriginalCategoricalPredictors; params._train = train._key; params._response_column = TARGET; params._fold_column = ArrayUtils.contains(train.names(), FOLDC) ? 
FOLDC : null; diff --git a/h2o-py/h2o/automl/autoh2o.py b/h2o-py/h2o/automl/autoh2o.py index 6cfd57a8e18e..1b28b8293e24 100644 --- a/h2o-py/h2o/automl/autoh2o.py +++ b/h2o-py/h2o/automl/autoh2o.py @@ -67,14 +67,15 @@ def __init__(self, include_algos=None, exploitation_ratio=0, modeling_plan=None, + preprocessing=None, monotone_constraints=None, - algo_parameters=None, keep_cross_validation_predictions=False, keep_cross_validation_models=False, keep_cross_validation_fold_assignment=False, sort_metric="AUTO", export_checkpoints_dir=None, - verbosity="warn"): + verbosity="warn", + **kwargs): """ Create a new H2OAutoML instance. @@ -117,11 +118,9 @@ def __init__(self, :param exploitation_ratio: The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase. By default, the exploitation phase is disabled (exploitation_ratio=0) as this is still experimental; to activate it, it is recommended to try a ratio around 0.1. Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best GBM found during exploration. :param modeling_plan: List of modeling steps to be used by the AutoML engine (they may not all get executed, depending on other constraints). Defaults to None (Expert usage only). + :param preprocessing: List of preprocessing steps to run. Only 'target_encoding' is currently supported. :param monotone_constraints: Dict representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. - :param algo_parameters: Dict of ``param_name=param_value`` to be passed to internal models. Defaults to none (Expert usage only). - By default, params are set only to algorithms accepting them, and ignored by others. - Only following parameters are currently allowed: ``"monotone_constraints"``. :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation predictions. This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions are required to build additional Stacked Ensemble models in AutoML. This option defaults to ``False``. @@ -137,6 +136,15 @@ def __init__(self, :param verbosity: Verbosity of the backend messages printed during training. Available options are None (live log disabled), 'debug', 'info' or 'warn'. Defaults to 'warn'. 
""" + + # early validate kwargs: + algo_parameters = None + for k in kwargs: + if k == 'algo_parameters': + algo_parameters = kwargs[k] + else: + raise TypeError("H2OAutoML got an unexpected keyword argument '%s'" % k) + # Check if H2O jar contains AutoML try: h2o.api("GET /3/Metadata/schemas/AutoMLV99") @@ -275,7 +283,14 @@ def assert_is_step(s): else: self.modeling_plan = None - assert_is_type(algo_parameters, None, dict) + assert_is_type(preprocessing, None, [str]) # for now + if preprocessing is not None: + assert all(p in ['target_encoding'] for p in preprocessing) + self.preprocessing = self.build_models['preprocessing'] = [dict(type=p.replace("_", "")) for p in preprocessing] + else: + self.preprocessing = None + + assert_is_type(monotone_constraints, None, dict) if monotone_constraints is not None: if algo_parameters is None: algo_parameters = {} diff --git a/h2o-py/h2o/sklearn/wrapper.py b/h2o-py/h2o/sklearn/wrapper.py index 7fade824ea81..2aaf45a9315c 100644 --- a/h2o-py/h2o/sklearn/wrapper.py +++ b/h2o-py/h2o/sklearn/wrapper.py @@ -18,7 +18,7 @@ from ..utils.shared_utils import can_use_numpy, can_use_pandas try: - from inspect import signature + from inspect import Parameter, signature except ImportError: from sklearn.utils.fixes import signature @@ -114,9 +114,11 @@ def wrap_estimator(cls, if default_params is None: # obtain the default params from signature of the estimator class constructor sig = signature(cls.__init__) + ignored_names = ['self'] + ignored_kind = [Parameter.VAR_KEYWORD, Parameter.VAR_POSITIONAL] default_params = OrderedDict((p.name, p.default if p.default is not p.empty else None) - for p in sig.parameters.values()) - del default_params['self'] + for p in sig.parameters.values() + if p.name not in ignored_names and p.kind not in ignored_kind) gen_class_name = name if name else cls.__name__+'Sklearn' gen_class_module = module if module else __name__ diff --git a/h2o-py/tests/testdir_algos/automl/pyunit_automl_preprocessing.py b/h2o-py/tests/testdir_algos/automl/pyunit_automl_preprocessing.py new file mode 100644 index 000000000000..f24c92280e97 --- /dev/null +++ b/h2o-py/tests/testdir_algos/automl/pyunit_automl_preprocessing.py @@ -0,0 +1,72 @@ +from __future__ import print_function +import sys, os + +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +import h2o.exceptions +from tests import pyunit_utils as pu +from h2o.automl import H2OAutoML + + +def import_dataset(seed=0, mode='binary'): + df = h2o.import_file(path=pu.locate("smalldata/titanic/titanic_expanded.csv"), header=1) + target = dict( + binary='survived', + multiclass='pclass', + regression='fare' + )[mode] + + fr = df.split_frame(ratios=[.8], seed=seed) + return pu.ns(train=fr[0], test=fr[1], target=target) + + +def test_target_encoding_binary(): + ds = import_dataset(mode='binary') + aml = H2OAutoML(project_name="automl_with_te_binary", + max_models=5, + preprocessing=['target_encoding'], + seed=1) + aml.train(y=ds.target, training_frame=ds.train, leaderboard_frame=ds.test) + lb = aml.leaderboard + print(lb) + # we can't really verify from client if TE was correctly applied... 
so just using a poor man's check: + mem_keys = h2o.ls().key + # print(mem_keys) + assert any(k.startswith("TargetEncoding_AutoML") for k in mem_keys) + + +def test_target_encoding_multiclass(): + ds = import_dataset(mode='multiclass') + aml = H2OAutoML(project_name="automl_with_te_multiclass", + max_models=5, + preprocessing=['target_encoding'], + seed=1) + aml.train(y=ds.target, training_frame=ds.train, leaderboard_frame=ds.test) + lb = aml.leaderboard + print(lb) + # we can't really verify from client if TE was correctly applied... so just using a poor man's check: + mem_keys = h2o.ls().key + # print(mem_keys) + assert any(k.startswith("TargetEncoding_AutoML") for k in mem_keys) + + +def test_target_encoding_regression(): + ds = import_dataset(mode='regression') + aml = H2OAutoML(project_name="automl_with_te_regression", + max_models=5, + preprocessing=['target_encoding'], + seed=1) + aml.train(y=ds.target, training_frame=ds.train, leaderboard_frame=ds.test) + lb = aml.leaderboard + print(lb) + # we can't really verify from client if TE was correctly applied... so just using a poor man's check: + mem_keys = h2o.ls().key + # print(mem_keys) + assert any(k.startswith("TargetEncoding_AutoML") for k in mem_keys) + + +pu.run_tests([ + test_target_encoding_binary, + test_target_encoding_multiclass, + test_target_encoding_regression +]) diff --git a/h2o-r/h2o-package/R/automl.R b/h2o-r/h2o-package/R/automl.R index 85cd66e6c1f5..890ac32bf85e 100644 --- a/h2o-r/h2o-package/R/automl.R +++ b/h2o-r/h2o-package/R/automl.R @@ -48,11 +48,9 @@ #' Defaults to NULL, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional. #' @param exploitation_ratio The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase. By default, the exploitation phase is disabled (exploitation_ratio=0) as this is still experimental; to activate it, it is recommended to try a ratio around 0.1. Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best GBM found during exploration. #' @param modeling_plan List. The list of modeling steps to be used by the AutoML engine (they may not all get executed, depending on other constraints). Optional (Expert usage only). +#' @param preprocessing List. The list of preprocessing steps to run. Only 'target_encoding' is currently supported. #' @param monotone_constraints List. A mapping representing monotonic constraints. #' Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. -#' @param algo_parameters List. A list of param_name=param_value to be passed to internal models. Defaults to none (Expert usage only). -#' By default, params are set only to algorithms accepting them, and ignored by others. -#' Only following parameters are currently allowed: "monotone_constraints". #' @param keep_cross_validation_predictions \code{Logical}. Whether to keep the predictions of the cross-validation predictions. This needs to be set to TRUE if running the same AutoML object for repeated runs because CV predictions are required to build additional Stacked Ensemble models in AutoML. This option defaults to FALSE. #' @param keep_cross_validation_models \code{Logical}. Whether to keep the cross-validated models. Keeping cross-validation models may consume significantly more memory in the H2O cluster. This option defaults to FALSE. #' @param keep_cross_validation_fold_assignment \code{Logical}. Whether to keep fold assignments in the models. 
Deleting them will save memory in the H2O cluster. Defaults to FALSE. @@ -63,6 +61,7 @@ #' @param export_checkpoints_dir (Optional) Path to a directory where every model will be stored in binary form. #' @param verbosity Verbosity of the backend messages printed during training; Optional. #' Must be one of NULL (live log disabled), "debug", "info", "warn". Defaults to "warn". +#' @param ... Additional (experimental) arguments to be passed through; Optional. #' @details AutoML finds the best model, given a training frame and response, and returns an H2OAutoML object, #' which contains a leaderboard of all the models that were trained in the process, ranked by a default model performance metric. #' @return An \linkS4class{H2OAutoML} object. @@ -100,16 +99,26 @@ h2o.automl <- function(x, y, training_frame, exclude_algos = NULL, include_algos = NULL, modeling_plan = NULL, + preprocessing = NULL, exploitation_ratio = 0.0, monotone_constraints = NULL, - algo_parameters = NULL, keep_cross_validation_predictions = FALSE, keep_cross_validation_models = FALSE, keep_cross_validation_fold_assignment = FALSE, sort_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "mean_per_class_error"), export_checkpoints_dir = NULL, - verbosity = "warn") + verbosity = "warn", + ...) { + dots <- list(...) + algo_parameters <- NULL + for (arg in names(dots)) { + if (arg == 'algo_parameters') { + algo_parameters <- dots$algo_parameters + } else { + stop(paste("unused argument", arg, "=", dots[[arg]])) + } + } tryCatch({ .h2o.__remoteSend(h2oRestApiVersion = 3, method="GET", page = "Metadata/schemas/AutoMLV99") @@ -251,6 +260,18 @@ h2o.automl <- function(x, y, training_frame, }) build_models$modeling_plan <- modeling_plan } + + if (!is.null(preprocessing)) { + is.string <- function(s) is.character(s) && length(s) == 1 + preprocessing <- lapply(preprocessing, function(step) { + if (is.string(step)) { + list(type=gsub("_", "", step)) + } else { + stop("preprocessing steps must be a string (only 'target_encoding' currently supported)") + } + }) + build_models$preprocessing <- preprocessing + } if (!is.null(monotone_constraints)) { if(is.null(algo_parameters)) algo_parameters <- list() diff --git a/h2o-r/tests/testdir_algos/automl/runit_automl_preprocessing.R b/h2o-r/tests/testdir_algos/automl/runit_automl_preprocessing.R new file mode 100644 index 000000000000..a7a4dd200501 --- /dev/null +++ b/h2o-r/tests/testdir_algos/automl/runit_automl_preprocessing.R @@ -0,0 +1,38 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") + +automl.preprocessing.suite <- function() { + + import_dataset <- function() { + y <- "survived" + fr <- h2o.importFile(locate("smalldata/titanic/titanic_expanded.csv")) + splits <- h2o.splitFrame(fr, destination_frames=c("r_amlte_train", "r_amlte_test"), seed = 1) + train <- splits[[1]] + test <- splits[[2]] + x <- setdiff(names(train), y) + return(list(x=x, y=y, train=train, test=test)) + } + + test_targetencoding_enabled <- function() { + ds <- import_dataset() + aml <- h2o.automl(x = ds$x, y = ds$y, + training_frame = ds$train, + leaderboard_frame = ds$test, + project_name="r_automl_targetencoding", + max_models = 6, + preprocessing = list("target_encoding"), + seed = 1 + ) + print(h2o.get_leaderboard(aml)) + keys <- h2o.ls()$key + expect_true(any(grepl("TargetEncoding_AutoML", keys))) + } + + + makeSuite( + test_targetencoding_enabled + ) +} + + +doSuite("AutoML Preprocessing Suite", 
automl.preprocessing.suite()) diff --git a/h2o-test-support/src/main/java/water/runner/H2ORunner.java b/h2o-test-support/src/main/java/water/runner/H2ORunner.java index 9cc3e3a446d0..817e0ab3ae67 100644 --- a/h2o-test-support/src/main/java/water/runner/H2ORunner.java +++ b/h2o-test-support/src/main/java/water/runner/H2ORunner.java @@ -116,7 +116,8 @@ private void printLeakedKeys(final Key[] leakedKeys, final CheckKeysTask.LeakInf final Value keyValue = Value.STORE_get(key); if (keyValue != null && keyValue.isFrame()) { Frame frame = (Frame) key.get(); - Log.err(String.format("Leaked frame with key '%s'. This frame contains the following vectors:", frame._key.toString())); + Log.err(String.format("Leaked frame with key '%s' and columns '%s'. This frame contains the following vectors:", + frame._key.toString(), Arrays.toString(frame.names()))); for (Key vecKey : frame.keys()) { if (!leakedKeysSet.contains(vecKey)) continue;