diff --git a/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py b/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py index 7f3a136e..34f1157a 100644 --- a/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py +++ b/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py @@ -23,7 +23,6 @@ import fire import numpy as np -import toolz import torch from albumentations import Compose, HorizontalFlip, Normalize, PadIfNeeded, Resize from ignite.contrib.handlers import ConcatScheduler, CosineAnnealingScheduler, LinearCyclicalScheduler diff --git a/cv_lib/cv_lib/event_handlers/__init__.py b/cv_lib/cv_lib/event_handlers/__init__.py index 589bbd86..8bd8567f 100644 --- a/cv_lib/cv_lib/event_handlers/__init__.py +++ b/cv_lib/cv_lib/event_handlers/__init__.py @@ -31,8 +31,7 @@ def _create_checkpoint_handler(self): def __call__(self, engine, to_save): self._checkpoint_handler(engine, to_save) if self._snapshot_function(): - files = glob.glob(os.path.join(self._model_save_location, self._running_model_prefix + "*")) - print(files) + files = glob.glob(os.path.join(self._model_save_location, self._running_model_prefix + "*")) name_postfix = os.path.basename(files[0]).lstrip(self._running_model_prefix) copyfile( files[0], diff --git a/cv_lib/cv_lib/event_handlers/logging_handlers.py b/cv_lib/cv_lib/event_handlers/logging_handlers.py index 97f382bc..de354760 100644 --- a/cv_lib/cv_lib/event_handlers/logging_handlers.py +++ b/cv_lib/cv_lib/event_handlers/logging_handlers.py @@ -26,15 +26,21 @@ def log_lr(optimizer, engine): @curry -def log_metrics(log_msg, engine, metrics_dict={"pixacc": "Avg accuracy :", "nll": "Avg loss :"}, fname=None): +def log_metrics( + engine, + evaluator, + metrics_dict={ + "nll": "Avg loss :", + "pixacc": "Pixelwise Accuracy :", + "mca": "Avg Class Accuracy :", + "mIoU": "Avg Class IoU :", + }, + stage="undefined", +): logger = logging.getLogger(__name__) - metrics = engine.state.metrics + 
metrics = evaluator.state.metrics metrics_msg = " ".join([f"{metrics_dict[k]} {metrics[k]:.4f}" for k in metrics_dict]) - if fname: - with open(fname, "w") as fid: - output_dict = {metrics_dict[k]: float(metrics[k]) for k in metrics_dict} - json.dump(output_dict, fid) - logger.info(f"{log_msg} - Epoch {engine.state.epoch} [{engine.state.max_epochs}] " + metrics_msg) + logger.info(f"{stage} - Epoch {engine.state.epoch} [{engine.state.max_epochs}] " + metrics_msg) @curry diff --git a/examples/interpretation/notebooks/Dutch_F3_patch_model_training_and_evaluation.ipynb b/examples/interpretation/notebooks/Dutch_F3_patch_model_training_and_evaluation.ipynb index 1748e45c..28351947 100644 --- a/examples/interpretation/notebooks/Dutch_F3_patch_model_training_and_evaluation.ipynb +++ b/examples/interpretation/notebooks/Dutch_F3_patch_model_training_and_evaluation.ipynb @@ -143,10 +143,7 @@ "from cv_lib.event_handlers import SnapshotHandler, logging_handlers\n", "from cv_lib.event_handlers.logging_handlers import Evaluator\n", "from cv_lib.event_handlers import tensorboard_handlers\n", - "from cv_lib.event_handlers.tensorboard_handlers import (\n", - " create_image_writer,\n", - " create_summary_writer, \n", - ")\n", + "from cv_lib.event_handlers.tensorboard_handlers import create_summary_writer\n", "from cv_lib.segmentation import models\n", "from cv_lib.segmentation.dutchf3.engine import (\n", " create_supervised_evaluator,\n", @@ -537,6 +534,7 @@ ")\n", "\n", "if papermill:\n", + " train_set = data.Subset(train_set, range(3))\n", " val_set = data.Subset(val_set, range(3))\n", "elif DEMO:\n", " val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))\n", @@ -571,15 +569,15 @@ "source": [ "# if we're running in test mode, just run 2 batches\n", "if papermill:\n", - " train_len = config.TRAIN.BATCH_SIZE_PER_GPU*2 \n", - "# if we're running in demo mode, just run 10 batches to fine-tune the model\n", + " train_len = 2\n", + "# if we're running in demo mode, 
just run 20 batches to fine-tune the model\n", "elif DEMO:\n", - " train_len = config.TRAIN.BATCH_SIZE_PER_GPU*10 \n", + " train_len = 20\n", "# if we're not in test or demo modes, run the entire loop\n", "else:\n", " train_len = len(train_loader)\n", "\n", - "snapshot_duration = scheduler_step * train_len if not papermill else 2*len(train_loader)" + "snapshot_duration = scheduler_step * train_len if not papermill else train_len" ] }, { @@ -678,10 +676,7 @@ "# create training engine\n", "trainer = create_supervised_trainer(\n", " model, optimizer, criterion, prepare_batch, device=device\n", - ")\n", - "\n", - "# add learning rate scheduler\n", - "trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)" + ")" ] }, { @@ -710,35 +705,16 @@ "generate_path(output_dir)\n", "\n", "# define main summary writer which logs all model summaries\n", - "summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))\n", - "\n", - "# add logging of training output\n", - "trainer.add_event_handler(\n", - " Events.ITERATION_COMPLETED,\n", - " logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),\n", - ")\n", - "\n", - "# add logging of learning rate\n", - "trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer))\n", - "\n", - "# log learning rate to tensorboard\n", - "trainer.add_event_handler(\n", - " Events.EPOCH_STARTED,\n", - " tensorboard_handlers.log_lr(summary_writer, optimizer, \"epoch\"),\n", - ")\n", - "\n", - "# log training summary to tensorboard as well\n", - "trainer.add_event_handler(\n", - " Events.ITERATION_COMPLETED,\n", - " tensorboard_handlers.log_training_output(summary_writer),\n", - ")" + "summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We also checkpoint models and snapshot them to disk with every training epoch." 
+ "Next we need to score the model on the validation set as it's training. To do this we need to add helper functions to manipulate data into the required shape just as we've done to prepare each batch for training at the beginning of this notebook.\n", + "\n", + "We also set up evaluation metrics which we want to record on the training set." ] }, { @@ -747,28 +723,52 @@ "metadata": {}, "outputs": [], "source": [ - "# add model checkpointing\n", - "checkpoint_handler = ModelCheckpoint(\n", - " output_dir,\n", - " \"model_f3_nb\",\n", - " save_interval=1,\n", - " n_saved=1,\n", - " create_dir=True,\n", - " require_empty=False,\n", + "transform_fn = lambda output_dict: (output_dict[\"y_pred\"].squeeze(), output_dict[\"mask\"].squeeze())\n", + "evaluator = create_supervised_evaluator(\n", + " model,\n", + " prepare_batch,\n", + " metrics={\n", + " \"nll\": Loss(criterion, output_transform=transform_fn),\n", + " \"pixacc\": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device),\n", + " \"cacc\": class_accuracy(n_classes, output_transform=transform_fn),\n", + " \"mca\": mean_class_accuracy(n_classes, output_transform=transform_fn),\n", + " \"ciou\": class_iou(n_classes, output_transform=transform_fn),\n", + " \"mIoU\": mean_iou(n_classes, output_transform=transform_fn),\n", + " },\n", + " device=device,\n", ")\n", + "trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)\n", "\n", + "# Logging:\n", "trainer.add_event_handler(\n", - " Events.EPOCH_COMPLETED, checkpoint_handler, {config.MODEL.NAME: model}\n", - ")" + " Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),\n", + ")\n", + "trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer))\n", + "\n", + "# Tensorboard and Logging:\n", + "trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer))\n", + "trainer.add_event_handler(Events.ITERATION_COMPLETED, 
tensorboard_handlers.log_validation_output(summary_writer))\n", + "\n", + "# add specific logger which also triggers printed metrics on training set\n", + "@trainer.on(Events.EPOCH_COMPLETED)\n", + "def log_training_results(engine):\n", + " evaluator.run(train_loader)\n", + " tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage=\"Training\")\n", + " logging_handlers.log_metrics(engine, evaluator, stage=\"Training\")\n", + "\n", + "# add specific logger which also triggers printed metrics on validation set\n", + "@trainer.on(Events.EPOCH_COMPLETED)\n", + "def log_validation_results(engine):\n", + " evaluator.run(val_loader)\n", + " tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage=\"Validation\")\n", + " logging_handlers.log_metrics(engine, evaluator, stage=\"Validation\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next we need to score the model on validation set as it's training. To do this we need to add helper functions to manipulate data into the required shape just as we've done to prepare each batch for training at the beginning of this notebook.\n", - "\n", - "We also set up evaluation metrics which we want to record on the training set." + "We also checkpoint models and snapshot them to disk with every training epoch." 
] }, { @@ -777,90 +777,18 @@ "metadata": {}, "outputs": [], "source": [ - "# helper function for\n", - "def _select_pred_and_mask(model_out_dict):\n", - " return (model_out_dict[\"y_pred\"].squeeze(), model_out_dict[\"mask\"].squeeze())\n", - "\n", - "\n", - "def _select_max(pred_tensor):\n", - " return pred_tensor.max(1)[1]\n", - "\n", - "\n", - "def _tensor_to_numpy(pred_tensor):\n", - " return pred_tensor.squeeze().cpu().numpy()\n", - "\n", - "\n", - "def snapshot_function():\n", - " return (trainer.state.iteration % snapshot_duration) == 0\n", - "\n", - "evaluator = create_supervised_evaluator(\n", - " model,\n", - " prepare_batch,\n", - " metrics={\n", - " \"nll\": Loss(criterion, output_transform=_select_pred_and_mask),\n", - " \"pixacc\": pixelwise_accuracy(\n", - " n_classes, output_transform=_select_pred_and_mask, device=device\n", - " ),\n", - " \"cacc\": class_accuracy(n_classes, output_transform=_select_pred_and_mask),\n", - " \"mca\": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask),\n", - " \"ciou\": class_iou(n_classes, output_transform=_select_pred_and_mask),\n", - " \"mIoU\": mean_iou(n_classes, output_transform=_select_pred_and_mask),\n", - " },\n", - " device=device,\n", - ")\n", - "\n", - "trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader))\n", - "\n", - "evaluator.add_event_handler(\n", - " Events.EPOCH_COMPLETED,\n", - " logging_handlers.log_metrics(\n", - " \"Validation results\",\n", - " metrics_dict={\n", - " \"nll\": \"Avg loss :\",\n", - " \"pixacc\": \"Pixelwise Accuracy :\",\n", - " \"mca\": \"Avg Class Accuracy :\",\n", - " \"mIoU\": \"Avg Class IoU :\",\n", - " },\n", - " ),\n", - ")\n", - "\n", - "evaluator.add_event_handler(\n", - " Events.EPOCH_COMPLETED,\n", - " tensorboard_handlers.log_metrics(\n", - " summary_writer,\n", - " trainer,\n", - " \"epoch\",\n", - " metrics_dict={\n", - " \"mIoU\": \"Validation/mIoU\",\n", - " \"nll\": \"Validation/Loss\",\n", - " \"mca\": 
\"Validation/MCA\",\n", - " \"pixacc\": \"Validation/Pixel_Acc\",\n", - " },\n", - " ),\n", - ")\n", - "\n", - "\n", - "transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes), _tensor_to_numpy)\n", - "\n", - "transform_pred = compose(transform_func, _select_max)\n", - "\n", - "evaluator.add_event_handler(\n", - " Events.EPOCH_COMPLETED,\n", - " create_image_writer(summary_writer, \"Validation/Image\", \"image\"),\n", - ")\n", - "\n", - "evaluator.add_event_handler(\n", - " Events.EPOCH_COMPLETED,\n", - " create_image_writer(\n", - " summary_writer, \"Validation/Mask\", \"mask\", transform_func=transform_func\n", - " ),\n", + "# add model checkpointing\n", + "checkpoint_handler = ModelCheckpoint(\n", + " output_dir,\n", + " \"model_f3_nb\",\n", + " save_interval=1,\n", + " n_saved=1,\n", + " create_dir=True,\n", + " require_empty=False,\n", ")\n", "\n", - "evaluator.add_event_handler(\n", - " Events.EPOCH_COMPLETED,\n", - " create_image_writer(\n", - " summary_writer, \"Validation/Pred\", \"y_pred\", transform_func=transform_pred\n", - " ),\n", + "trainer.add_event_handler(\n", + " Events.EPOCH_COMPLETED, checkpoint_handler, {config.MODEL.NAME: model}\n", ")" ] }, diff --git a/experiments/interpretation/dutchf3_patch/local/default.py b/experiments/interpretation/dutchf3_patch/local/default.py index 4a4c74af..0322d5b1 100644 --- a/experiments/interpretation/dutchf3_patch/local/default.py +++ b/experiments/interpretation/dutchf3_patch/local/default.py @@ -11,8 +11,10 @@ _C = CN() -_C.OUTPUT_DIR = "output" # This will be the base directory for all output, such as logs and saved models -_C.LOG_DIR = "" # This will be a subdirectory inside OUTPUT_DIR +# This will be the base directory for all output, such as logs and saved models +_C.OUTPUT_DIR = "output" +# This will be a subdirectory inside OUTPUT_DIR +_C.LOG_DIR = "" _C.GPUS = (0,) _C.WORKERS = 4 _C.PRINT_FREQ = 20 @@ -21,6 +23,8 @@ _C.LOG_CONFIG = "logging.conf" _C.SEED = 42 _C.OPENCV_BORDER_CONSTANT = 
0 +# number of batches to use in test/debug mode +_C.NUM_DEBUG_BATCHES = 1 # Cudnn related params _C.CUDNN = CN() diff --git a/experiments/interpretation/dutchf3_patch/local/train.py b/experiments/interpretation/dutchf3_patch/local/train.py index 9e26f7a2..1d4e3348 100644 --- a/experiments/interpretation/dutchf3_patch/local/train.py +++ b/experiments/interpretation/dutchf3_patch/local/train.py @@ -12,7 +12,7 @@ Time to run on single V100 for 300 epochs: 4.5 days """ - +import json import logging import logging.config from os import path @@ -28,7 +28,7 @@ from ignite.utils import convert_tensor from cv_lib.event_handlers import SnapshotHandler, logging_handlers, tensorboard_handlers -from cv_lib.event_handlers.tensorboard_handlers import create_summary_writer, log_results +from cv_lib.event_handlers.tensorboard_handlers import create_summary_writer from cv_lib.segmentation import extract_metric_from, models from cv_lib.segmentation.dutchf3.engine import create_supervised_evaluator, create_supervised_trainer from cv_lib.segmentation.dutchf3.utils import current_datetime, generate_path, git_branch, git_hash @@ -53,6 +53,7 @@ def run(*options, cfg=None, debug=False): Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file + Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: @@ -63,7 +64,7 @@ def run(*options, cfg=None, debug=False): debug (bool): Places scripts in debug/test mode and only executes a few iterations """ # Configuration: - update_config(config, options=options, config_file=cfg) + update_config(config, options=options, config_file=cfg) # The model will be saved under: outputs// config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] @@ -147,13 +148,15 @@ def run(*options, cfg=None, debug=False): if debug: 
logger.info("Running in debug mode..") - train_set = data.Subset(train_set, list(range(4))) - val_set = data.Subset(val_set, list(range(4))) + train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU*config.NUM_DEBUG_BATCHES)) + val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU)) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True ) - val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS) + val_loader = data.DataLoader( + val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1 + ) # NOTE(review): num_workers is hard-coded to 1 here; it was config.WORKERS before - confirm this is intended # Model: model = getattr(models, config.MODEL.NAME).get_seg_model(config) @@ -203,40 +206,38 @@ def run(*options, cfg=None, debug=False): # Logging: trainer.add_event_handler( - Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU), + Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer)) - fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" if debug else None - evaluator.add_event_handler( - Events.EPOCH_COMPLETED, - logging_handlers.log_metrics( - "Validation results", - metrics_dict={ - "nll": "Avg loss :", - "pixacc": "Pixelwise Accuracy :", - "mca": "Avg Class Accuracy :", - "mIoU": "Avg Class IoU :", - }, - fname=fname, - ), - ) - # Tensorboard and Logging: trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer)) trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer)) + # add specific logger which also triggers printed metrics on training set @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) - log_results(engine, 
evaluator, summary_writer, n_classes, stage="Training") + tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training") + logging_handlers.log_metrics(engine, evaluator, stage="Training") + # add specific logger which also triggers printed metrics on validation set @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) - log_results(engine, evaluator, summary_writer, n_classes, stage="Validation") - - # Checkpointing: + tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation") + logging_handlers.log_metrics(engine, evaluator, stage="Validation") + # dump validation set metrics at the very end for debugging purposes + if engine.state.epoch == config.TRAIN.END_EPOCH and debug: + fname = f"metrics_test_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" + metrics = evaluator.state.metrics + out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]} + with open(fname, "w") as fid: + json.dump(out_dict, fid) + log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys()) + logging.info(log_msg) + + # Checkpointing: snapshotting trained models to disk checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, @@ -246,15 +247,8 @@ def log_validation_results(engine): evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") - if debug: - trainer.run( - train_loader, - max_epochs=config.TRAIN.END_EPOCH, - epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU, - seed=config.SEED, - ) - else: - trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED) + trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED) + summary_writer.close() diff --git a/tests/cicd/main_build.yml b/tests/cicd/main_build.yml index 875c1c05..4a0b8f2b 100644 --- a/tests/cicd/main_build.yml +++ 
b/tests/cicd/main_build.yml @@ -113,30 +113,37 @@ jobs: # Create a temporary directory to store the statuses dir=$(mktemp -d) + # debug mode runs only a single batch by default, so increase the + # number of debug batches (NUM_DEBUG_BATCHES) to obtain a representative set of results + pids= # export CUDA_VISIBLE_DEVICES=0 - { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ + { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' \ + 'NUM_DEBUG_BATCHES' 5 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \ 'TRAIN.DEPTH' 'none' \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'no_depth' \ --cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } pids+=" $!" # export CUDA_VISIBLE_DEVICES=1 - { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ + { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' \ + 'NUM_DEBUG_BATCHES' 5 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \ 'TRAIN.DEPTH' 'section' \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \ --cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } pids+=" $!" # export CUDA_VISIBLE_DEVICES=2 - { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ + { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' \ + 'NUM_DEBUG_BATCHES' 5 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \ 'TRAIN.DEPTH' 'section' \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \ --cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } pids+=" $!" 
# export CUDA_VISIBLE_DEVICES=3 - { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ + { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/checkerboard/data' \ + 'NUM_DEBUG_BATCHES' 5 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \ 'TRAIN.DEPTH' 'section' \ 'MODEL.PRETRAINED' '/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth' \ @@ -262,24 +269,28 @@ jobs: # export CUDA_VISIBLE_DEVICES=0 { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'TRAIN.DEPTH' 'none' \ + 'TRAIN.BATCH_SIZE_PER_GPU' 2 'VALIDATION.BATCH_SIZE_PER_GPU' 2 \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'no_depth' \ --cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } pids+=" $!" # export CUDA_VISIBLE_DEVICES=1 { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'TRAIN.DEPTH' 'section' \ + 'TRAIN.BATCH_SIZE_PER_GPU' 2 'VALIDATION.BATCH_SIZE_PER_GPU' 2 \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \ --cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } pids+=" $!" # export CUDA_VISIBLE_DEVICES=2 { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'TRAIN.DEPTH' 'section' \ + 'TRAIN.BATCH_SIZE_PER_GPU' 2 'VALIDATION.BATCH_SIZE_PER_GPU' 2 \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \ --cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } pids+=" $!" 
# export CUDA_VISIBLE_DEVICES=3 { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \ 'TRAIN.DEPTH' 'section' \ + 'TRAIN.BATCH_SIZE_PER_GPU' 2 'VALIDATION.BATCH_SIZE_PER_GPU' 2 \ 'MODEL.PRETRAINED' '/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth' \ 'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \ --cfg=configs/hrnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }