Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
Data flow tests (#375)
Browse files Browse the repository at this point in the history
* renamed checkerboard job name

* restructured default outputs from test.py to be dumped under output dir and not debug dir

* test.py output re-org

* removed outdated variable from check_performance.py

* intermediate work

* intermediate work

* a bunch of intermediate work

* changing args for different trainings

* final to run dev_build

* remove print statements

* removed print statement

* removed suppressed lines

* added assertion error msg

* added assertion error msg, one intentional bug to test

* testing a stupid bug

* debug

* omg

* final

* trigger build
  • Loading branch information
fazamani authored Jun 19, 2020
1 parent 4f35e8d commit cf5473a
Show file tree
Hide file tree
Showing 6 changed files with 270 additions and 28 deletions.
48 changes: 39 additions & 9 deletions experiments/interpretation/dutchf3_patch/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def _patch_label_2d(

# dump the data right before it's being put into the model and after scoring
if debug:
outdir = f"debug/batch_{split}"
outdir = f"debug/test/batch_{split}"
generate_path(outdir)
for i in range(batch.shape[0]):
path_prefix = f"{outdir}/{batch_indexes[i][0]}_{batch_indexes[i][1]}"
Expand All @@ -251,7 +251,7 @@ def _patch_label_2d(


def _evaluate_split(
split, section_aug, model, pre_processing, output_processing, device, running_metrics_overall, config, debug=False,
split, section_aug, model, pre_processing, output_processing, device, running_metrics_overall, config, data_flow, debug=False,
):
logger = logging.getLogger(__name__)

Expand All @@ -267,28 +267,40 @@ def _evaluate_split(

n_classes = test_set.n_classes

if debug:
data_flow[split] = dict()
data_flow[split]['test_section_loader_length'] = len(test_set)
data_flow[split]['test_input_shape'] = test_set.seismic.shape
data_flow[split]['test_label_shape'] = test_set.labels.shape
data_flow[split]['n_classes'] = n_classes


test_loader = data.DataLoader(test_set, batch_size=1, num_workers=config.WORKERS, shuffle=False)

if debug:
data_flow[split]['test_loader_length'] = len(test_loader)
logger.info("Running in Debug/Test mode")
test_loader = take(2, test_loader)
take_n = 2
test_loader = take(take_n, test_loader)
data_flow[split]['take_n_sections'] = take_n
pred_list, gt_list, img_list = [], [], []


try:
output_dir = generate_path(
f"debug/{config.OUTPUT_DIR}_test_{split}", git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),
f"{config.OUTPUT_DIR}/test/{split}", git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),
)
except:
output_dir = generate_path(f"debug/{config.OUTPUT_DIR}_test_{split}", config.MODEL.NAME, current_datetime(),)
output_dir = generate_path(f"{config.OUTPUT_DIR}/test/{split}", config.MODEL.NAME, current_datetime(),)

running_metrics_split = runningScore(n_classes)


# evaluation mode:
with torch.no_grad(): # operations inside don't track history
model.eval()
total_iteration = 0
for i, (images, labels) in enumerate(test_loader):
logger.info(f"split: {split}, section: {i}")
total_iteration = total_iteration + 1
outputs = _patch_label_2d(
model,
images,
Expand All @@ -306,13 +318,23 @@ def _evaluate_split(

pred = outputs.detach().max(1)[1].numpy()
gt = labels.numpy()
if debug:
pred_list.append((pred.shape, len(np.unique(pred))))
gt_list.append((gt.shape, len(np.unique(gt))))
img_list.append(images.numpy().shape)

running_metrics_split.update(gt, pred)
running_metrics_overall.update(gt, pred)

# dump images to disk for review
mask_to_disk(pred.squeeze(), os.path.join(output_dir, f"{i}_pred.png"), n_classes)
mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"), n_classes)

if debug:
data_flow[split]['pred_shape'] = pred_list
data_flow[split]['gt_shape'] = gt_list
data_flow[split]['img_shape'] = img_list

# get scores
score, class_iou = running_metrics_split.get_scores()

Expand Down Expand Up @@ -363,7 +385,7 @@ def test(*options, cfg=None, debug=False):
load_log_configuration(config.LOG_CONFIG)
logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
log_dir, model_name = os.path.split(config.TEST.MODEL_PATH)
log_dir, _ = os.path.split(config.TEST.MODEL_PATH)

# load model:
model = getattr(models, config.MODEL.NAME).get_seg_model(config)
Expand Down Expand Up @@ -396,6 +418,7 @@ def test(*options, cfg=None, debug=False):
output_processing = _output_processing_pipeline(config)

splits = ["test1", "test2"] if "Both" in config.TEST.SPLIT else [config.TEST.SPLIT]
data_flow = dict()
for sdx, split in enumerate(splits):
labels = np.load(path.join(config.DATASET.ROOT, "test_once", split + "_labels.npy"))
section_file = path.join(config.DATASET.ROOT, "splits", "section_" + split + ".txt")
Expand All @@ -409,9 +432,17 @@ def test(*options, cfg=None, debug=False):
device,
running_metrics_overall,
config,
data_flow,
debug=debug,
)

if debug:
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

fname = f"data_flow_test_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
with open(fname, 'w') as f:
json.dump(data_flow, f, indent=1)

# FINAL TEST RESULTS:
score, class_iou = running_metrics_overall.get_scores()

Expand All @@ -434,7 +465,6 @@ def test(*options, cfg=None, debug=False):
np.savetxt(path.join(log_dir, "confusion.csv"), confusion, delimiter=" ")

if debug:
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
fname = f"metrics_test_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
with open(fname, "w") as fid:
json.dump(
Expand Down
23 changes: 20 additions & 3 deletions experiments/interpretation/dutchf3_patch/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,6 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F
# Set CUDNN benchmark mode:
torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

# We will write the model under outputs / config_file_name / model_dir
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

# Fix random seeds:
torch.manual_seed(config.SEED)
if torch.cuda.is_available():
Expand Down Expand Up @@ -155,16 +152,28 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F

n_classes = train_set.n_classes
val_set = TrainPatchLoader(config, split="val", is_transform=True, augmentations=val_aug, debug=debug,)

logger.info(val_set)

if debug:
data_flow_dict = dict()

data_flow_dict['train_patch_loader_length'] = len(train_set)
data_flow_dict['validation_patch_loader_length'] = len(val_set)
data_flow_dict['train_input_shape'] = train_set.seismic.shape
data_flow_dict['train_label_shape'] = train_set.labels.shape
data_flow_dict['n_classes'] = n_classes

logger.info("Running in debug mode..")
train_range = min(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES, len(train_set))
logging.info(f"train range in debug mode {train_range}")
train_set = data.Subset(train_set, range(train_range))
valid_range = min(config.VALIDATION.BATCH_SIZE_PER_GPU, len(val_set))
val_set = data.Subset(val_set, range(valid_range))

data_flow_dict['train_length_subset'] = len(train_set)
data_flow_dict['validation_length_subset'] = len(val_set)

train_sampler = torch.utils.data.distributed.DistributedSampler(train_set, num_replicas=world_size, rank=local_rank)
val_sampler = torch.utils.data.distributed.DistributedSampler(val_set, num_replicas=world_size, rank=local_rank)

Expand All @@ -175,6 +184,14 @@ def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=F
val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=val_sampler
)

if debug:
data_flow_dict['train_loader_length'] = len(train_loader)
data_flow_dict['validation_loader_length'] = len(val_loader)
config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
fname = f"data_flow_train_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
with open(fname, 'w') as f:
json.dump(data_flow_dict, f, indent=2)

# Model:
model = getattr(models, config.MODEL.NAME).get_seg_model(config)
device = "cuda" if torch.cuda.is_available() else "cpu"
Expand Down
13 changes: 5 additions & 8 deletions interpretation/deepseismic_interpretation/dutchf3/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def __getitem__(self, index):
im, lbl = _transform_WH_to_HW(im), _transform_WH_to_HW(lbl)

if self.debug and "test" in self.split:
outdir = f"debug/sectionLoader_{self.split}_raw"
outdir = f"debug/test/sectionLoader_{self.split}_raw"
generate_path(outdir)
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
image_to_disk(im, path_prefix + "_img.png", self.MIN, self.MAX)
Expand All @@ -167,7 +167,7 @@ def __getitem__(self, index):
im, lbl = self.transform(im, lbl)

if self.debug and "test" in self.split:
outdir = f"debug/sectionLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
outdir = f"debug/test/sectionLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
generate_path(outdir)
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
image_to_disk(np.array(im[0]), path_prefix + "_img.png", self.MIN, self.MAX)
Expand Down Expand Up @@ -397,7 +397,7 @@ def __getitem__(self, index):

# dump images before augmentation
if self.debug:
outdir = f"debug/testSectionLoaderWithDepth_{self.split}_raw"
outdir = f"debug/test/testSectionLoaderWithDepth_{self.split}_raw"
generate_path(outdir)
# this needs to take the first dimension of image (no depth) but lbl only has 1 dim
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
Expand All @@ -416,7 +416,7 @@ def __getitem__(self, index):
# dump images and labels to disk after augmentation
if self.debug:
outdir = (
f"debug/testSectionLoaderWithDepth_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
f"debug/test/testSectionLoaderWithDepth_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
)
generate_path(outdir)
path_prefix = f"{outdir}/index_{index}_section_{section_name}"
Expand Down Expand Up @@ -773,9 +773,6 @@ def __repr__(self):
"patch": TrainPatchLoaderWithDepth,
}

_TRAIN_SECTION_LOADERS = {"section": TrainSectionLoaderWithDepth}


def get_patch_loader(cfg):
assert str(cfg.TRAIN.DEPTH).lower() in [
"section",
Expand All @@ -785,6 +782,7 @@ def get_patch_loader(cfg):
Valid values: section, patch, none."
return _TRAIN_PATCH_LOADERS.get(cfg.TRAIN.DEPTH, TrainPatchLoader)

_TRAIN_SECTION_LOADERS = {"section": TrainSectionLoaderWithDepth}

def get_section_loader(cfg):
assert str(cfg.TRAIN.DEPTH).lower() in [
Expand All @@ -797,7 +795,6 @@ def get_section_loader(cfg):

_TEST_LOADERS = {"section": TestSectionLoaderWithDepth}


def get_test_loader(cfg):
logger = logging.getLogger(__name__)
logger.info(f"Test loader {cfg.TRAIN.DEPTH}")
Expand Down
30 changes: 26 additions & 4 deletions tests/cicd/main_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,15 @@ jobs:
echo "cv_lib unit test job passed"
###################################################################################################
# Stage 3: Dutch F3 patch models on checkerboard test set:
# Stage 3: Patch models on checkerboard test set:
# deconvnet, unet, HRNet patch depth, HRNet section depth
# CAUTION: reverted these builds to single-GPU leaving new multi-GPU code in to be reverted later
###################################################################################################

- job: checkerboard_dutchf3_patch
- job: checkerboard_patch
dependsOn: cv_lib_unit_tests_job
timeoutInMinutes: 60
displayName: Checkerboard Dutch F3 patch local
displayName: Checkerboard patch local
pool:
name: deepseismicagentpool
steps:
Expand All @@ -148,6 +148,7 @@ jobs:
'TRAIN.END_EPOCH' 2 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'none' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'no_depth' \
'WORKERS' 1 \
--cfg=configs/patch_deconvnet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
Expand All @@ -158,6 +159,7 @@ jobs:
'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'section' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
'WORKERS' 1 \
--cfg=configs/unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
Expand All @@ -168,6 +170,7 @@ jobs:
'TRAIN.END_EPOCH' 1 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'section' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
'WORKERS' 1 \
--cfg=configs/seresnet_unet.yaml --debug ; echo "$?" > "$dir/$BASHPID"; }
Expand All @@ -178,6 +181,7 @@ jobs:
'TRAIN.END_EPOCH' 2 'TRAIN.SNAPSHOTS' 1 \
'DATASET.NUM_CLASSES' 2 'DATASET.CLASS_WEIGHTS' '[1.0, 1.0]' \
'TRAIN.DEPTH' 'section' \
'TRAIN.BATCH_SIZE_PER_GPU' 16 'VALIDATION.BATCH_SIZE_PER_GPU' 32 \
'MODEL.PRETRAINED' '/home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth' \
'OUTPUT_DIR' 'output' 'TRAIN.MODEL_DIR' 'section_depth' \
'WORKERS' 1 \
Expand All @@ -195,6 +199,16 @@ jobs:
# Remove the temporary directory
rm -r "$dir"
set -e
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_patch_deconvnet_no_depth.json --step train --train_depth none
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_unet_section_depth.json --step train --train_depth section
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_seresnet_unet_section_depth.json --step train --train_depth section
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_train_hrnet_section_depth.json --step train --train_depth section
set +e
# check validation set performance
set -e
Expand Down Expand Up @@ -286,6 +300,14 @@ jobs:
# Remove the temporary directory
rm -r "$dir"
# check data flow for test
set -e
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_patch_deconvnet_no_depth.json --step test --train_depth none
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_unet_section_depth.json --step test --train_depth section
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_seresnet_unet_section_depth.json --step test --train_depth section
python ../../../tests/cicd/src/check_data_flow.py --infile data_flow_test_hrnet_section_depth.json --step test --train_depth section
set +e
# check test set performance
set -e
# TODO: enable this after investigating reproducibility problem of patch_deconvnet for small-size training data
Expand All @@ -304,7 +326,7 @@ jobs:
###################################################################################################

- job: F3_block_training_and_evaluation_local_notebook
dependsOn: checkerboard_dutchf3_patch
dependsOn: checkerboard_patch
timeoutInMinutes: 5
displayName: F3 block training and evaluation local notebook
pool:
Expand Down
Loading

0 comments on commit cf5473a

Please sign in to comment.