This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Data flow tests #375

Merged 21 commits on Jun 19, 2020. Changes shown from 12 commits.
50 changes: 41 additions & 9 deletions experiments/interpretation/dutchf3_patch/local/test.py
@@ -230,7 +230,7 @@ def _patch_label_2d(

# dump the data right before it's being put into the model and after scoring
if debug:
outdir = f"debug/batch_{split}"
outdir = f"debug/test/batch_{split}"
generate_path(outdir)
for i in range(batch.shape[0]):
path_prefix = f"{outdir}/{batch_indexes[i][0]}_{batch_indexes[i][1]}"
@@ -251,7 +251,7 @@ def _patch_label_2d(


def _evaluate_split(
split, section_aug, model, pre_processing, output_processing, device, running_metrics_overall, config, debug=False,
split, section_aug, model, pre_processing, output_processing, device, running_metrics_overall, config, data_flow, debug=False,
):
logger = logging.getLogger(__name__)

@@ -267,28 +267,42 @@

n_classes = test_set.n_classes

if debug:
data_flow[split] = dict()
data_flow[split]['test_section_loader_length'] = len(test_set)
data_flow[split]['test_input_shape'] = test_set.seismic.shape
data_flow[split]['test_label_shape'] = test_set.labels.shape
data_flow[split]['n_classes'] = n_classes


test_loader = data.DataLoader(test_set, batch_size=1, num_workers=config.WORKERS, shuffle=False)

if debug:
data_flow[split]['test_loader_length'] = len(test_loader)
logger.info("Running in Debug/Test mode")
test_loader = take(2, test_loader)
take_n = 2
test_loader = take(take_n, test_loader)
data_flow[split]['take_n_sections'] = take_n
pred_list, gt_list, img_list = [], [], []


try:
output_dir = generate_path(
f"debug/{config.OUTPUT_DIR}_test_{split}", git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),
f"{config.OUTPUT_DIR}/test/{split}", git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),
)
except:
output_dir = generate_path(f"debug/{config.OUTPUT_DIR}_test_{split}", config.MODEL.NAME, current_datetime(),)
output_dir = generate_path(f"{config.OUTPUT_DIR}/test/{split}", config.MODEL.NAME, current_datetime(),)

running_metrics_split = runningScore(n_classes)


# evaluation mode:
with torch.no_grad(): # operations inside don't track history
model.eval()
total_iteration = 0
# total_iteration = 0
for i, (images, labels) in enumerate(test_loader):
logger.info(f"split: {split}, section: {i}")
total_iteration = total_iteration + 1
# total_iteration = total_iteration + 1
outputs = _patch_label_2d(
model,
images,
@@ -306,13 +320,23 @@

pred = outputs.detach().max(1)[1].numpy()
gt = labels.numpy()
if debug:
pred_list.append((pred.shape, len(np.unique(pred))))
gt_list.append((gt.shape, len(np.unique(gt))))
img_list.append(images.numpy().shape)

running_metrics_split.update(gt, pred)
running_metrics_overall.update(gt, pred)

# dump images to disk for review
mask_to_disk(pred.squeeze(), os.path.join(output_dir, f"{i}_pred.png"), n_classes)
mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"), n_classes)

if debug:
data_flow[split]['pred_shape'] = pred_list
data_flow[split]['gt_shape'] = gt_list
data_flow[split]['img_shape'] = img_list

# get scores
score, class_iou = running_metrics_split.get_scores()

@@ -363,7 +387,7 @@ def test(*options, cfg=None, debug=False):
load_log_configuration(config.LOG_CONFIG)
logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
log_dir, model_name = os.path.split(config.TEST.MODEL_PATH)
log_dir, _ = os.path.split(config.TEST.MODEL_PATH)

# load model:
model = getattr(models, config.MODEL.NAME).get_seg_model(config)
@@ -396,6 +420,7 @@ def test(*options, cfg=None, debug=False):
output_processing = _output_processing_pipeline(config)

splits = ["test1", "test2"] if "Both" in config.TEST.SPLIT else [config.TEST.SPLIT]
data_flow = dict()
for sdx, split in enumerate(splits):
labels = np.load(path.join(config.DATASET.ROOT, "test_once", split + "_labels.npy"))
section_file = path.join(config.DATASET.ROOT, "splits", "section_" + split + ".txt")
@@ -409,9 +434,17 @@
device,
running_metrics_overall,
config,
data_flow,
debug=debug,
)

if debug:
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

fname = f"data_flow_test_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
with open(fname, 'w') as f:
json.dump(data_flow, f, indent=1)

# FINAL TEST RESULTS:
score, class_iou = running_metrics_overall.get_scores()

@@ -434,7 +467,6 @@ def test(*options, cfg=None, debug=False):
np.savetxt(path.join(log_dir, "confusion.csv"), confusion, delimiter=" ")

if debug:
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
fname = f"metrics_test_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
with open(fname, "w") as fid:
json.dump(
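Reviewer note: for orientation, here is a minimal sketch of the JSON that the new debug path in test.py emits. Only the key names, the `indent=1` dump, and the filename pattern (`data_flow_test_{config_file_name}_{MODEL_DIR}.json`) come from the diff; the shapes and values are illustrative assumptions.

```python
import json

# Illustration only: key names come from the diff; shapes and values are
# made-up stand-ins for one evaluated split with take_n_sections = 2.
data_flow = {
    "test1": {
        "test_section_loader_length": 200,    # len(test_set)
        "test_input_shape": [200, 701, 255],  # test_set.seismic.shape
        "test_label_shape": [200, 701, 255],  # test_set.labels.shape
        "n_classes": 2,
        "test_loader_length": 200,            # len(test_loader) before take()
        "take_n_sections": 2,                 # sections kept in debug mode
        # one (shape, number-of-unique-values) entry per evaluated section
        "pred_shape": [[[1, 701, 255], 2], [[1, 701, 255], 2]],
        "gt_shape": [[[1, 701, 255], 2], [[1, 701, 255], 2]],
        "img_shape": [[1, 1, 701, 255], [1, 1, 701, 255]],
    }
}

with open("data_flow_test_patch_deconvnet_no_depth.json", "w") as f:
    json.dump(data_flow, f, indent=1)
```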
28 changes: 20 additions & 8 deletions experiments/interpretation/dutchf3_patch/local/train.py
@@ -90,9 +90,6 @@ def run(*options, cfg=None, debug=False, input=None):
# Set CUDNN benchmark mode:
torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

# We will write the model under outputs / config_file_name / model_dir
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

# Fix random seeds:
torch.manual_seed(config.SEED)
if torch.cuda.is_available():
@@ -142,8 +139,8 @@
)
logger.info(train_set)


n_classes = train_set.n_classes

val_set = TrainPatchLoader(
config,
split="val",
@@ -152,26 +149,41 @@
debug=debug,
)
logger.info(val_set)



if debug:
data_flow_dict = dict()

data_flow_dict['train_patch_loader_length'] = len(train_set)
data_flow_dict['validation_patch_loader_length'] = len(val_set)
data_flow_dict['train_input_shape'] = train_set.seismic.shape
data_flow_dict['train_label_shape'] = train_set.labels.shape
data_flow_dict['n_classes'] = n_classes

logger.info("Running in debug mode..")
train_range = min(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES, len(train_set))
logging.info(f"train range in debug mode {train_range}")
train_set = data.Subset(train_set, range(train_range))
valid_range = min(config.VALIDATION.BATCH_SIZE_PER_GPU, len(val_set))
val_set = data.Subset(val_set, range(valid_range))



data_flow_dict['train_length_subset'] = len(train_set)
data_flow_dict['validation_length_subset'] = len(val_set)

train_loader = data.DataLoader(
train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True
)
val_loader = data.DataLoader(
val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1
) # config.WORKERS)

if debug:
data_flow_dict['train_loader_length'] = len(train_loader)
data_flow_dict['validation_loader_length'] = len(val_loader)

fname = f"data_flow_train_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
with open(fname, 'w') as f:
json.dump(data_flow_dict, f, indent=2)

# Model:
model = getattr(models, config.MODEL.NAME).get_seg_model(config)
device = "cuda" if torch.cuda.is_available() else "cpu"
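Reviewer note: the debug branch in train.py caps both datasets with `data.Subset` before building the loaders. A self-contained sketch of that arithmetic; the toy dataset and config numbers are stand-ins, only the `min()`/`Subset` pattern mirrors the diff.

```python
import torch
from torch.utils import data

# Stand-ins for config.TRAIN.BATCH_SIZE_PER_GPU and config.NUM_DEBUG_BATCHES.
BATCH_SIZE_PER_GPU = 16
NUM_DEBUG_BATCHES = 2

class ToyDataset(data.Dataset):
    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        return torch.zeros(1), 0

train_set = ToyDataset()
# cap the debug subset at the dataset length so tiny datasets don't raise IndexError
train_range = min(BATCH_SIZE_PER_GPU * NUM_DEBUG_BATCHES, len(train_set))
train_set = data.Subset(train_set, range(train_range))

train_loader = data.DataLoader(train_set, batch_size=BATCH_SIZE_PER_GPU, shuffle=True)
print(len(train_set), len(train_loader))  # 32 and 2 with these numbers
```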
13 changes: 5 additions & 8 deletions interpretation/deepseismic_interpretation/dutchf3/data.py
@@ -153,7 +153,7 @@ def __getitem__(self, index):
im, lbl = _transform_WH_to_HW(im), _transform_WH_to_HW(lbl)

if self.debug and "test" in self.split:
outdir = f"debug/sectionLoader_{self.split}_raw"
outdir = f"debug/test/sectionLoader_{self.split}_raw"
generate_path(outdir)
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
image_to_disk(im, path_prefix + "_img.png", self.MIN, self.MAX)
@@ -167,7 +167,7 @@ def __getitem__(self, index):
im, lbl = self.transform(im, lbl)

if self.debug and "test" in self.split:
outdir = f"debug/sectionLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
outdir = f"debug/test/sectionLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
generate_path(outdir)
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
image_to_disk(np.array(im[0]), path_prefix + "_img.png", self.MIN, self.MAX)
@@ -397,7 +397,7 @@ def __getitem__(self, index):

# dump images before augmentation
if self.debug:
outdir = f"debug/testSectionLoaderWithDepth_{self.split}_raw"
outdir = f"debug/test/testSectionLoaderWithDepth_{self.split}_raw"
generate_path(outdir)
# this needs to take the first dimension of image (no depth) but lbl only has 1 dim
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
@@ -416,7 +416,7 @@
# dump images and labels to disk after augmentation
if self.debug:
outdir = (
f"debug/testSectionLoaderWithDepth_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
f"debug/test/testSectionLoaderWithDepth_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
)
generate_path(outdir)
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
@@ -773,9 +773,6 @@ def __repr__(self):
"patch": TrainPatchLoaderWithDepth,
}

_TRAIN_SECTION_LOADERS = {"section": TrainSectionLoaderWithDepth}


def get_patch_loader(cfg):
assert str(cfg.TRAIN.DEPTH).lower() in [
"section",
@@ -785,6 +782,7 @@ def get_patch_loader(cfg):
Valid values: section, patch, none."
return _TRAIN_PATCH_LOADERS.get(cfg.TRAIN.DEPTH, TrainPatchLoader)

_TRAIN_SECTION_LOADERS = {"section": TrainSectionLoaderWithDepth}

def get_section_loader(cfg):
assert str(cfg.TRAIN.DEPTH).lower() in [
@@ -797,7 +795,6 @@

_TEST_LOADERS = {"section": TestSectionLoaderWithDepth}


def get_test_loader(cfg):
logger = logging.getLogger(__name__)
logger.info(f"Test loader {cfg.TRAIN.DEPTH}")
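Reviewer note: the moves in data.py place each registry dict directly above the factory that consumes it. The dispatch itself is a plain dict lookup with a default; a minimal sketch under assumed names (the real module maps `cfg.TRAIN.DEPTH` to loader classes such as `TrainPatchLoaderWithDepth`).

```python
# Sketch of the registry-dispatch pattern behind get_patch_loader /
# get_section_loader. The loader classes here are empty stand-ins.
class TrainPatchLoader:
    pass

class TrainPatchLoaderWithDepth:
    pass

# maps the TRAIN.DEPTH setting to a loader class; "none" is handled by the fallback
_TRAIN_PATCH_LOADERS = {
    "section": TrainPatchLoaderWithDepth,  # assumed entry, for illustration
    "patch": TrainPatchLoaderWithDepth,
}

def get_patch_loader(depth: str):
    assert depth.lower() in ["section", "patch", "none"], (
        f"Depth {depth} is not supported. Valid values: section, patch, none."
    )
    # unknown or "none" keys fall back to the plain patch loader
    return _TRAIN_PATCH_LOADERS.get(depth, TrainPatchLoader)

print(get_patch_loader("none").__name__)  # -> TrainPatchLoader
```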
30 changes: 26 additions & 4 deletions tests/cicd/main_build.yml
@@ -114,15 +114,15 @@ jobs:
echo "cv_lib unit test job passed"

###################################################################################################
# Stage 3: Dutch F3 patch models on checkerboard test set:
# Stage 3: Patch models on checkerboard test set:
# deconvnet, unet, HRNet patch depth, HRNet section depth
# CAUTION: reverted these builds to single-GPU leaving new multi-GPU code in to be reverted later
###################################################################################################

- job: checkerboard_dutchf3_patch
- job: checkerboard_patch
dependsOn: cv_lib_unit_tests_job
timeoutInMinutes: 60
displayName: Checkerboard Dutch F3 patch local
displayName: Checkerboard patch local
pool:
name: deepseismicagentpool
steps:
@@ -148,6 +148,7 @@ jobs:
'TRAIN.END_EPOCH' 2 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'none' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'no_depth' \
'WORKERS' 1 \
--cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
@@ -158,6 +159,7 @@ jobs:
'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'section' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
'WORKERS' 1 \
--cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
@@ -168,6 +170,7 @@ jobs:
'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'section' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
'WORKERS' 1 \
--cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
@@ -178,6 +181,7 @@ jobs:
'TRAIN.END_EPOCH' 2 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'section' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'MODEL.PRETRAINED' '/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth' \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
'WORKERS' 1 \
@@ -195,6 +199,16 @@ jobs:

# Remove the temporary directory
rm -r "$dir"


set -e
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_patch_deconvnet_no_depth.json --step train --train_depth none
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_unet_section_depth.json --step train --train_depth section
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_seresnet_unet_section_depth.json --step train --train_depth section
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_hrnet_section_depth.json --step train --train_depth section
set +e



# check validation set performance
set -e
@@ -286,6 +300,14 @@ jobs:
# Remove the temporary directory
rm -r "$dir"

# check data flow for test
set -e
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_patch_deconvnet_no_depth.json --step test --train_depth none
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_unet_section_depth.json --step test --train_depth section
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_seresnet_unet_section_depth.json --step test --train_depth section
python ../../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_hrnet_section_depth.json --step test --train_depth section
set +e

# check test set performance
set -e
# TODO: enable this after investigating reproducibility problem of patch_deconvnet for small-size training data
@@ -304,7 +326,7 @@
###################################################################################################

- job: F3_block_training_and_evaluation_local_notebook
dependsOn: checkerboard_dutchf3_patch
dependsOn: checkerboard_patch
timeoutInMinutes: 5
displayName: F3 block training and evaluation local notebook
pool:
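Reviewer note: the pipeline now gates on tests/cicd/src/check_data_flow.py, which is not part of this diff; only its CLI (`--infile`, `--step`, `--train_depth`) is visible above. A hypothetical skeleton consistent with that CLI; the asserts are guesses at plausible invariants, not the script's actual checks.

```python
import argparse
import json

def main():
    parser = argparse.ArgumentParser(description="Validate a data-flow JSON dump")
    parser.add_argument("--infile", required=True)
    parser.add_argument("--step", choices=["train", "test"], required=True)
    parser.add_argument("--train_depth", choices=["none", "section", "patch"], required=True)
    args = parser.parse_args()

    with open(args.infile) as f:
        flow = json.load(f)

    if args.step == "train":
        # the debug subset can never exceed the full loader's length
        assert flow["train_length_subset"] <= flow["train_patch_loader_length"]
    else:
        # one recorded prediction shape per section taken in debug mode
        for split, d in flow.items():
            assert d["take_n_sections"] == len(d["pred_shape"]), split

    print(f"{args.infile}: data flow OK")

if __name__ == "__main__":
    main()
```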