This repository has been archived by the owner on Apr 8, 2024. It is now read-only.

Implement sweep settings as dataclasses, refactor training settings with distinct sections #138

Merged (10 commits) on Nov 24, 2021
docs/run/azureml/train-on-your-data.md (86 changes: 52 additions & 34 deletions)
@@ -44,16 +44,17 @@ tasks:
!!! hint
`tasks` is actually a list; if you provide multiple entries, the pipeline will train one model per train/test pair (see the sketch below).
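
For example, a two-task setup could look like the sketch below (the dataset names here are hypothetical placeholders, not datasets shipped with the benchmark):

```yaml
lightgbm_training:
  tasks:
    - train_dataset: "my-regression-500cols-train"   # hypothetical dataset name
      test_dataset: "my-regression-500cols-test"     # hypothetical dataset name
      task_key: "my-regression-500cols"              # optional, used to register outputs
    - train_dataset: "my-regression-1000cols-train"  # hypothetical dataset name
      test_dataset: "my-regression-1000cols-test"    # hypothetical dataset name
      task_key: "my-regression-1000cols"
```

With this config the pipeline would train (and optionally register) one model per pair.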

4\. If you want the pipeline to save your model as a dataset, set `register_model` to `True` and uncomment the lines below to name the output according to the naming convention:

```yaml
lightgbm_training:
  reference:
    # model registration
    # naming convention: "{register_model_prefix}-{task_key}-{num_iterations}trees-{num_leaves}leaves-{register_model_suffix}"
    output:
      register_model: False
      #register_model_prefix: "model"
      #register_model_suffix: null
```
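
To make the naming convention concrete: with the values sketched below (taken from the example configs in this repo), the registered model name should resolve as shown in the comment.

```yaml
output:
  register_model: True
  register_model_prefix: "model"
  register_model_suffix: "cpu"
  # with task_key "synthetic-regression-100cols", num_iterations 100 and num_leaves 31,
  # the expected registered name is:
  # "model-synthetic-regression-100cols-100trees-31leaves-cpu"
```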

!!! hint
@@ -76,12 +77,13 @@ The benchmark training pipeline is entirely configurable. There are a few key pa
!!! hint
Check out example config file `conf/experiments/lightgbm_training/cpu.yaml`.

To enable multi-node training, simply modify the number of nodes under `runtime`:

```yaml
lightgbm_training:
  reference:
    runtime:
      nodes: 1
```

When running the pipeline, it will automatically partition the data to match the number of nodes, and provision a multi-node training job on the required number of nodes.
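
For instance, with `nodes: 2` and `processes: 2` the train data would be split into 4 partitions (see the `auto_partitioning` comment in the example configs: the expected number of partitions is `nodes * processes`). Since the pipeline is driven by Hydra config files, a command-line override such as `lightgbm_training.reference.runtime.nodes=4` should also work; the exact override path is assumed from the config structure above, not taken from the pipeline docs.
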
@@ -95,12 +97,15 @@ python pipelines/lightgbm_training.py --config-dir ./conf --config-name experime
!!! hint
Check out example config file `conf/experiments/lightgbm_training/gpu.yaml`.

To enable GPU training, modify the options below to build a GPU-ready docker image and turn on GPU in LightGBM training:

```yaml
lightgbm_training:
  reference:
    training:
      device_type: "gpu"
    runtime:
      build: "docker/lightgbm-v3.3.0/linux_gpu_pip.dockerfile"
```

When running the pipeline, it will automatically run on the gpu cluster you've named in your `compute/myaml.yaml` file.
@@ -114,12 +119,13 @@ python pipelines/lightgbm_training.py --config-dir ./conf --config-name experime
!!! hint
Check out example config file `conf/experiments/lightgbm_training/cpu-custom.yaml`.

To enable training on a custom build, modify the options below:

```yaml
lightgbm_training:
  reference:
    runtime:
      build: "dockers/lightgbm_cpu_mpi_custom.dockerfile" # relative to lightgbm_python folder
```

When running the pipeline, it will build the container from this custom dockerfile and use it to run your job.
@@ -139,14 +145,15 @@ To enable parameter sweep, just change the "sweepable" parameters (see below) to

```yaml
lightgbm_training:
  reference:
    training:
      # "sweepable" training parameters
      num_iterations: "choice(100, 200)"
      num_leaves: "choice(10,20,30)"
      min_data_in_leaf: 20
      learning_rate: 0.1
      max_bin: 255
      feature_fraction: 1.0
```

Running the pipeline with this config will automatically try multiple values for the parameters and return the best model.
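
The example above only uses `choice(...)`, but other AzureML sweep expressions should be usable for continuous parameters as well. A minimal sketch, assuming the standard AzureML sweep syntax (`uniform`, `quniform`, `loguniform`, ...) is accepted for these fields:

```yaml
lightgbm_training:
  reference:
    training:
      num_leaves: "choice(10, 20, 30)"           # discrete set
      learning_rate: "uniform(0.01, 0.3)"        # continuous range
      min_data_in_leaf: "quniform(10, 100, 10)"  # quantized range (step of 10)
```
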
@@ -159,13 +166,20 @@ You can also modify the parameters of Sweep itself, see [documentation on the ro

```yaml
lightgbm_training:
  reference:
    sweep:
      #primary_metric: "node_0/valid_0.rmse" # if left commented out, defaults to "node_0/valid_0.METRIC"
      goal: "minimize"
      algorithm: "random"
      early_termination:
        policy_type: "median_stopping"
        evaluation_interval: 1
        delay_evaluation: 5
        truncation_percentage: 20
      limits:
        max_total_trials: 100
        max_concurrent_trials: 10
        timeout_minutes: 60
```
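
As a quick reading of these settings: `median_stopping` cancels trials whose best primary metric is worse than the median of the running averages of all trials, checked every `evaluation_interval` metric reports and only after `delay_evaluation` reports; the `limits` block stops the sweep after 100 trials in total, at most 10 running concurrently, or 60 minutes, whichever comes first.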

## Running multiple variants of training parameters
@@ -199,13 +213,17 @@ lightgbm_training:

```yaml
lightgbm_training:
  # reference settings for the benchmark
  # all variants will be based on this
  reference:
    # lots of other params here
    training:
      num_iterations: 100

  # variant settings override what is in reference
  variants:
    - training:
        num_iterations: 10
    - training:
        num_iterations: 1000
    - training:
        num_iterations: 5000
```
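
Each variant is expected to produce its own training job, whose settings are the reference deep-merged with the variant entry. An illustration of the resulting settings for the first variant (not actual pipeline output):

```yaml
# illustrative: what the first variant effectively trains with
training:
  num_iterations: 10   # overridden by the variant
  # every other setting (data, runtime, output, remaining training params)
  # is inherited unchanged from reference
```
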
@@ -43,33 +43,55 @@ lightgbm_training:

  # reference settings for the benchmark
  # all variants are defined as diffs of these parameters
  reference:
    # input parameters
    data:
      auto_partitioning: True # inserts partitioning to match expected number of partitions (if nodes*processes > 1)
      pre_convert_to_binary: False # inserts conversion of train/test data into binary to speed up training (not compatible with auto_partitioning yet)
      header: false
      label_column: "0"
      group_column: null

    # lightgbm training parameters
    training:
      objective: "regression"
      metric: "rmse"
      boosting: "gbdt"
      tree_learner: "data"
      num_iterations: 100
      num_leaves: 31
      min_data_in_leaf: 20
      learning_rate: 0.1
      max_bin: 255
      feature_fraction: 1.0

      # compute parameters
      device_type: "cpu"

      # you can add anything under custom_params, it will be sent as a dictionary
      # to the lightgbm training module to override its parameters (see lightgbm docs for list)
      custom_params:
        deterministic: True
        use_two_round_loading: True

    # compute parameters
    runtime:
      #target: null # optional: force target for this training job
      nodes: 1
      processes: 1

    # model registration
    # naming convention: "{register_model_prefix}-{task_key}-{num_iterations}trees-{num_leaves}leaves-{register_model_suffix}"
    output:
      register_model: True
      register_model_prefix: "model"
      register_model_suffix: null

  # variant settings override specific parameters of reference
  variants:
    - training:
        num_iterations: 10
    - training:
        num_iterations: 1000
    - training:
        num_iterations: 5000

@@ -43,33 +43,51 @@ lightgbm_training:

  # reference settings for the benchmark
  # all variants are defined as diffs of these parameters
  reference:
    # input parameters
    data:
      auto_partitioning: True # inserts partitioning to match expected number of partitions (if nodes*processes > 1)
      pre_convert_to_binary: False # inserts conversion of train/test data into binary to speed up training (not compatible with auto_partitioning yet)
      header: false
      label_column: "0"
      group_column: null

    # lightgbm training parameters
    training:
      objective: "regression"
      metric: "rmse"
      boosting: "gbdt"
      tree_learner: "data"
      num_iterations: 100
      num_leaves: 31
      min_data_in_leaf: 20
      learning_rate: 0.1
      max_bin: 255
      feature_fraction: 1.0

      # compute parameters
      device_type: "cpu"

      # you can add anything under custom_params, it will be sent as a dictionary
      # to the lightgbm training module to override its parameters (see lightgbm docs for list)
      custom_params:
        deterministic: True
        use_two_round_loading: True

    # compute parameters
    runtime:
      #target: null # optional: force target for this training job
      nodes: 1
      processes: 1

    # model registration
    # naming convention: "{register_model_prefix}-{task_key}-{num_iterations}trees-{num_leaves}leaves-{register_model_suffix}"
    output:
      register_model: False

  # variant settings override specific parameters of reference
  variants:
    - training:
        device_type: "gpu"
      runtime:
        build: "docker/lightgbm-v3.2.1/linux_gpu_pip.dockerfile" # relative to lightgbm_python folder

@@ -29,42 +29,55 @@ module_loader: # module loading params
lightgbm_training:
  benchmark_name: "benchmark-dev"

  # list all the train/test pairs to train on
  tasks:
    - train_dataset: "data-synthetic-regression-100cols-100000samples-train"
      test_dataset: "data-synthetic-regression-100cols-10000samples-test"
      task_key: "synthetic-regression-100cols" # optional, used to register outputs

  # NOTE: this example uses only 1 training (reference)
  # see other config files for creating training variants
  reference:
    # input parameters
    data:
      auto_partitioning: True # inserts partitioning to match expected number of partitions (if nodes*processes > 1)
      pre_convert_to_binary: False # inserts conversion of train/test data into binary to speed up training (not compatible with auto_partitioning yet)
      header: false
      label_column: "0"
      group_column: null

    # lightgbm training parameters
    training:
      objective: "regression"
      metric: "rmse"
      boosting: "gbdt"
      tree_learner: "data"
      num_iterations: 100
      num_leaves: 31
      min_data_in_leaf: 20
      learning_rate: 0.1
      max_bin: 255
      feature_fraction: 1.0

      # compute parameters
      device_type: "cpu"

      # you can add anything under custom_params, it will be sent as a dictionary
      # to the lightgbm training module to override its parameters (see lightgbm docs for list)
      custom_params:
        deterministic: True
        use_two_round_loading: True

    # compute parameters
    runtime:
      #target: null # optional: force target for this training job
      nodes: 1
      processes: 1
      build: "docker/lightgbm-custom/v330_patch_cpu_mpi_build.dockerfile" # relative to lightgbm_python folder

    # model registration
    # naming convention: "{register_model_prefix}-{task_key}-{num_iterations}trees-{num_leaves}leaves-{register_model_suffix}"
    output:
      register_model: True
      register_model_prefix: "model"
      register_model_suffix: "cpu-custom"