From d0aa884fdf429dda5f5d9e8b660389c20298379d Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Mon, 20 Apr 2020 17:47:16 +0000
Subject: [PATCH 1/5] resolved rebase conflict

---
 .../penobscot/local/logging.conf              | 34 -------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 contrib/experiments/interpretation/penobscot/local/logging.conf

diff --git a/contrib/experiments/interpretation/penobscot/local/logging.conf b/contrib/experiments/interpretation/penobscot/local/logging.conf
deleted file mode 100644
index 56334fc4..00000000
--- a/contrib/experiments/interpretation/penobscot/local/logging.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-[loggers]
-keys=root,__main__,event_handlers
-
-[handlers]
-keys=consoleHandler
-
-[formatters]
-keys=simpleFormatter
-
-[logger_root]
-level=INFO
-handlers=consoleHandler
-
-[logger___main__]
-level=INFO
-handlers=consoleHandler
-qualname=__main__
-propagate=0
-
-[logger_event_handlers]
-level=INFO
-handlers=consoleHandler
-qualname=event_handlers
-propagate=0
-
-[handler_consoleHandler]
-class=StreamHandler
-level=INFO
-formatter=simpleFormatter
-args=(sys.stdout,)
-
-[formatter_simpleFormatter]
-format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
-

From 8ed4af6ada5fd1c7b139a9db3f1cff5066b83b22 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Thu, 16 Apr 2020 20:45:40 +0000
Subject: [PATCH 2/5] resolved merge conflict

---
 .../dutchf3_patch/distributed/train.py        | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py b/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py
index 9d1a2631..bc28249a 100644
--- a/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py
+++ b/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py
@@ -24,6 +24,7 @@
 import cv2
 import fire
 import numpy as np
+import toolz
 import torch
 from albumentations import Compose, HorizontalFlip, Normalize, Resize, PadIfNeeded
 from cv_lib.utils import load_log_configuration
@@ -167,8 +168,7 @@ def run(*options, cfg=None, local_rank=0, debug=False):
         stride=config.TRAIN.STRIDE,
         patch_size=config.TRAIN.PATCH_SIZE,
         augmentations=train_aug,
-    )
-    logger.info(f"Training examples {len(train_set)}")
+    )
 
     val_set = TrainPatchLoader(
         config.DATASET.ROOT,
@@ -185,6 +185,13 @@ def run(*options, cfg=None, local_rank=0, debug=False):
     logger.info(f"Validation examples {len(val_set)}")
     n_classes = train_set.n_classes
 
+    #if debug:
+    #val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))
+    #train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU*2))
+
+    logger.info(f"Training examples {len(train_set)}")
+    logger.info(f"Validation examples {len(val_set)}")
+
     train_sampler = torch.utils.data.distributed.DistributedSampler(train_set, num_replicas=world_size, rank=local_rank)
 
     train_loader = data.DataLoader(
@@ -220,6 +227,8 @@ def run(*options, cfg=None, local_rank=0, debug=False):
     model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], find_unused_parameters=True)
 
     snapshot_duration = scheduler_step * len(train_loader)
+    if debug:
+        snapshot_duration = 2
     warmup_duration = 5 * len(train_loader)
     warmup_scheduler = LinearCyclicalScheduler(
         optimizer,
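A note on the train.py changes in PATCH 2/5 above: the commented-out block relies on torch.utils.data.Subset to cap each split at roughly a batch or two, so a debug run exercises the full train/validate loop in seconds, and snapshot_duration is likewise forced to 2 iterations under --debug. A minimal sketch of that subsetting pattern, assuming plain PyTorch datasets (shrink_for_debug is a hypothetical helper, not a name from train.py):

    # Hypothetical helper illustrating the debug-subsetting pattern from the
    # patch above; Subset wraps the dataset and exposes only the first n indices.
    from torch.utils import data

    def shrink_for_debug(dataset, n):
        return data.Subset(dataset, list(range(min(n, len(dataset)))))

    # Mirroring the commented-out lines in the patch:
    # train_set = shrink_for_debug(train_set, config.TRAIN.BATCH_SIZE_PER_GPU * 2)
    # val_set = shrink_for_debug(val_set, config.VALIDATION.BATCH_SIZE_PER_GPU)
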
From 5641445480aa16692917020bbab5e56e4cb61930 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Mon, 20 Apr 2020 17:47:16 +0000
Subject: [PATCH 3/5] resolved rebase conflict

---
 .../penobscot/local/logging.conf              | 34 -------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 contrib/experiments/interpretation/penobscot/local/logging.conf

diff --git a/contrib/experiments/interpretation/penobscot/local/logging.conf b/contrib/experiments/interpretation/penobscot/local/logging.conf
deleted file mode 100644
index 56334fc4..00000000
--- a/contrib/experiments/interpretation/penobscot/local/logging.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-[loggers]
-keys=root,__main__,event_handlers
-
-[handlers]
-keys=consoleHandler
-
-[formatters]
-keys=simpleFormatter
-
-[logger_root]
-level=INFO
-handlers=consoleHandler
-
-[logger___main__]
-level=INFO
-handlers=consoleHandler
-qualname=__main__
-propagate=0
-
-[logger_event_handlers]
-level=INFO
-handlers=consoleHandler
-qualname=event_handlers
-propagate=0
-
-[handler_consoleHandler]
-class=StreamHandler
-level=INFO
-formatter=simpleFormatter
-args=(sys.stdout,)
-
-[formatter_simpleFormatter]
-format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
-

From 19abb244a5256feef0edac1ffd89d43f792dcc17 Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Thu, 16 Apr 2020 20:45:40 +0000
Subject: [PATCH 4/5] resolved merge conflict

---
 .../dutchf3_patch/distributed/train.py        | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py b/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py
index 33bb0045..29c82d31 100644
--- a/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py
+++ b/contrib/experiments/interpretation/dutchf3_patch/distributed/train.py
@@ -156,14 +156,24 @@ def run(*options, cfg=None, local_rank=0, debug=False):
     logger.info(f"Validation examples {len(val_set)}")
     n_classes = train_set.n_classes
 
+<<<<<<< 5641445480aa16692917020bbab5e56e4cb61930
     if debug:
         logger.info("Running in debug mode..")
         train_set = data.Subset(train_set, list(range(4)))
         val_set = data.Subset(val_set, list(range(4)))
+=======
+    #if debug:
+    #val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))
+    #train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU*2))
+>>>>>>> resolved merge conflict
 
     logger.info(f"Training examples {len(train_set)}")
     logger.info(f"Validation examples {len(val_set)}")
 
+<<<<<<< 5641445480aa16692917020bbab5e56e4cb61930
+=======
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set, num_replicas=world_size, rank=local_rank)
+>>>>>>> resolved merge conflict
     train_sampler = torch.utils.data.distributed.DistributedSampler(train_set, num_replicas=world_size, rank=local_rank)
 
     train_loader = data.DataLoader(
@@ -198,7 +208,13 @@ def run(*options, cfg=None, local_rank=0, debug=False):
 
     model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], find_unused_parameters=True)
 
+<<<<<<< 5641445480aa16692917020bbab5e56e4cb61930
     snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2*len(train_loader)
+=======
+    snapshot_duration = scheduler_step * len(train_loader)
+    if debug:
+        snapshot_duration = 2
+>>>>>>> resolved merge conflict
     warmup_duration = 5 * len(train_loader)
     warmup_scheduler = LinearCyclicalScheduler(
         optimizer,
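A caution on PATCH 4/5 above: despite its subject line, the hunk commits literal <<<<<<<, =======, and >>>>>>> markers into train.py, which become syntax errors the moment Python parses the file. For reference, a sketch of how the conflicted region would read once actually resolved, assuming the 5641445 side (the already-present active debug subsetting) is kept and the duplicated DistributedSampler line from the incoming side is dropped; this excerpt depends on the surrounding definitions in train.py (logger, data, config, world_size):

    # Sketch of the resolved region; every line is taken from one of the two
    # sides of the conflict above, with markers and duplicates removed.
    if debug:
        logger.info("Running in debug mode..")
        train_set = data.Subset(train_set, list(range(4)))
        val_set = data.Subset(val_set, list(range(4)))

    logger.info(f"Training examples {len(train_set)}")
    logger.info(f"Validation examples {len(val_set)}")

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, num_replicas=world_size, rank=local_rank
    )
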
From d5f3a1aa55e2a359e0cc1a324ff577fcb878d65e Mon Sep 17 00:00:00 2001
From: Max Kaznady
Date: Wed, 29 Apr 2020 14:29:35 +0000
Subject: [PATCH 5/5] reverted multi-GPU builds to run on single GPU

---
 tests/cicd/main_build.yml | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/tests/cicd/main_build.yml b/tests/cicd/main_build.yml
index bfc9b026..f1a33f61 100644
--- a/tests/cicd/main_build.yml
+++ b/tests/cicd/main_build.yml
@@ -91,6 +91,7 @@ jobs:
 
 ###################################################################################################
 # Stage 3: Dutch F3 patch models: deconvnet, unet, HRNet patch depth, HRNet section depth
+# CAUTION: reverted these builds to single-GPU leaving new multi-GPU code in to be reverted later
 ###################################################################################################
 
 - job: dutchf3_patch
@@ -113,30 +114,30 @@
       dir=$(mktemp -d)
 
       pids=
-      export CUDA_VISIBLE_DEVICES=0
+      # export CUDA_VISIBLE_DEVICES=0
       { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
       'TRAIN.DEPTH' 'none' \
       'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'no_depth' \
-      --cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
-      export CUDA_VISIBLE_DEVICES=1
+      # export CUDA_VISIBLE_DEVICES=1
      { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
       'TRAIN.DEPTH' 'section' \
       'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
-      --cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
-      export CUDA_VISIBLE_DEVICES=2
+      # export CUDA_VISIBLE_DEVICES=2
       { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
       'TRAIN.DEPTH' 'section' \
       'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
-      --cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
-      export CUDA_VISIBLE_DEVICES=3
+      # export CUDA_VISIBLE_DEVICES=3
       { python train.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' 'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
       'TRAIN.DEPTH' 'section' \
       'MODEL.PRETRAINED' '/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth' \
       'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
-      --cfg=configs/hrnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/hrnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
 
       wait $pids || exit 1
@@ -157,16 +158,16 @@
       dir=$(mktemp -d)
 
       pids=
-      export CUDA_VISIBLE_DEVICES=0
+      # export CUDA_VISIBLE_DEVICES=0
       # find the latest model which we just trained
       model_dir=$(ls -td output/patch_deconvnet/no_depth/* | head -1)
       model=$(ls -t ${model_dir}/*.pth | head -1)
       # try running the test script
       { python test.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' \
       'TEST.MODEL_PATH' ${model} \
-      --cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
-      export CUDA_VISIBLE_DEVICES=1
+      # export CUDA_VISIBLE_DEVICES=1
       # find the latest model which we just trained
       model_dir=$(ls -td output/unet/section_depth/* | head -1)
       model=$(ls -t ${model_dir}/*.pth | head -1)
@@ -174,9 +175,9 @@
 
       # try running the test script
       { python test.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' \
       'TEST.MODEL_PATH' ${model} \
-      --cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
-      export CUDA_VISIBLE_DEVICES=2
+      # export CUDA_VISIBLE_DEVICES=2
       # find the latest model which we just trained
       model_dir=$(ls -td output/seresnet_unet/section_depth/* | head -1)
       model=$(ls -t ${model_dir}/*.pth | head -1)
@@ -184,9 +185,9 @@
       # try running the test script
       { python test.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' \
       'TEST.MODEL_PATH' ${model} \
-      --cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
-      export CUDA_VISIBLE_DEVICES=3
+      # export CUDA_VISIBLE_DEVICES=3
       # find the latest model which we just trained
       model_dir=$(ls -td output/hrnet/section_depth/* | head -1)
       model=$(ls -t ${model_dir}/*.pth | head -1)
@@ -195,7 +196,7 @@
       { python test.py 'DATASET.ROOT' '/home/alfred/data_dynamic/dutch_f3/data' \
       'MODEL.PRETRAINED' '/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth' \
       'TEST.MODEL_PATH' ${model} \
-      --cfg=configs/hrnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; } &
+      --cfg=configs/hrnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
       pids+=" $!"
 
       # wait for completion
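A note on the CI pattern in PATCH 5/5: each command group writes its exit status to a file named after $BASHPID under a mktemp directory, and wait $pids previously gathered the backgrounded jobs. With the trailing & removed, the four commands now run sequentially on a single GPU; note that $! then no longer refers to these command groups, so the pids bookkeeping is vestigial until the multi-GPU version is restored. A rough Python translation of the surviving control flow (the command list is abbreviated for illustration, not the full set from main_build.yml):

    # Hedged sketch: run each build command in order, collect exit codes, and
    # fail the stage if any command failed, mirroring the sequential shape of
    # the reverted script above.
    import subprocess

    commands = [
        ["python", "train.py", "--cfg=configs/patch_deconvnet.yaml", "--debug"],
        ["python", "train.py", "--cfg=configs/unet.yaml", "--debug"],
    ]

    statuses = [subprocess.run(cmd).returncode for cmd in commands]
    if any(code != 0 for code in statuses):
        raise SystemExit(1)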