This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

289: correctness metrics and tighter tests #293

Merged: 54 commits, merged on May 5, 2020
Commits (54)
d0aa884  resolved rebase conflict (maxkazmsft, Apr 20, 2020)
8ed4af6  resolved merge conflict (maxkazmsft, Apr 16, 2020)
5641445  resolved rebase conflict (maxkazmsft, Apr 20, 2020)
19abb24  resolved merge conflict (maxkazmsft, Apr 16, 2020)
bca191d  resolved merge conflict (maxkazmsft, Apr 16, 2020)
7773924  resolved rebase conflict (maxkazmsft, Apr 20, 2020)
6bcaae6  wrote the bulk of checkerboard example (maxkazmsft, Apr 14, 2020)
8b73d65  finished checkerboard generator (maxkazmsft, Apr 14, 2020)
3a739be  got binary dataset to run (maxkazmsft, Apr 15, 2020)
512ed01  finished first implementation mockup - commit before rebase (maxkazmsft, Apr 22, 2020)
c57a641  made sure rebase went well manually (maxkazmsft, Apr 22, 2020)
cb8f66e  added new files (maxkazmsft, Apr 22, 2020)
2332aae  resolved merge conflict (maxkazmsft, Apr 22, 2020)
2d4c5f3  resolved merge conflict (maxkazmsft, Apr 22, 2020)
a1dc129  resolved PR comments and made tests work (maxkazmsft, Apr 24, 2020)
c2aae9c  fixed build error (maxkazmsft, Apr 27, 2020)
51c5749  fixed build VM errors (maxkazmsft, Apr 27, 2020)
14a50af  more fixes to get the test to pass (maxkazmsft, Apr 27, 2020)
626bc9e  fixed n_classes issue in data.py (maxkazmsft, Apr 27, 2020)
615aea8  fixed notebook as well (maxkazmsft, Apr 27, 2020)
9e91cc2  cleared notebook run cell (maxkazmsft, Apr 28, 2020)
bf41e3d  trivial commit to restart builds (maxkazmsft, Apr 28, 2020)
86897b8  addressed PR comments (maxkazmsft, Apr 28, 2020)
63c3bce  moved notebook tests to main build pipeline (maxkazmsft, Apr 28, 2020)
55087d4  fixed checkerboard label precision (maxkazmsft, Apr 28, 2020)
784e3aa  resolved merge conflict 2 (maxkazmsft, Apr 28, 2020)
c19d96b  resolved merge conflict 3 (maxkazmsft, Apr 28, 2020)
fc362f6  relaxed performance tests for now (maxkazmsft, Apr 28, 2020)
19369f4  resolved merge conflict (maxkazmsft, Apr 16, 2020)
4bd4cdf  resolved merge conflict (maxkazmsft, Apr 16, 2020)
4a01d47  Merge branch 'correctness' of github.com:maxkazmsft/seismic-deeplearn… (maxkazmsft, Apr 29, 2020)
d214633  merged with updated correctness branch (maxkazmsft, Apr 29, 2020)
420e609  fixed build error (maxkazmsft, Apr 29, 2020)
82f5221  resolved merge conflicts (maxkazmsft, Apr 29, 2020)
f2175d3  fixed another merge mistake (maxkazmsft, Apr 29, 2020)
7155f86  resolved rebase conflict (maxkazmsft, Apr 20, 2020)
9589b76  resolved rebase 2 (maxkazmsft, Apr 30, 2020)
7e7acca  resolved merge conflict (maxkazmsft, Apr 16, 2020)
e4f4ed9  resolved merge conflict (maxkazmsft, Apr 16, 2020)
7ba0dc5  resolved merge conflict (maxkazmsft, Apr 30, 2020)
ede71e4  adding new logging (maxkazmsft, May 1, 2020)
e39b067  added better logging - cleaner - debugged metrics on checkerboard dat… (maxkazmsft, May 1, 2020)
6a5bf95  resolved rebase conflict (maxkazmsft, Apr 20, 2020)
110d771  resolved merge conflict (maxkazmsft, Apr 16, 2020)
dd67625  resolved merge conflict (maxkazmsft, Apr 16, 2020)
d467f76  resolved rebase 2 (maxkazmsft, Apr 30, 2020)
3a27af5  resolved merge conflict (maxkazmsft, Apr 16, 2020)
dcab630  resolved merge conflict (maxkazmsft, Apr 16, 2020)
57cc989  resolved merge conflict (maxkazmsft, May 1, 2020)
33e5422  resolved merge with correctness (maxkazmsft, May 1, 2020)
de79388  updated notebook with the changes (maxkazmsft, May 4, 2020)
e805790  addressed PR comments (maxkazmsft, May 4, 2020)
9bc0d6e  addressed another PR comment (maxkazmsft, May 4, 2020)
e17b848  Merge branch 'correctness' into 289 (maxkazmsft, May 5, 2020)
Files changed
@@ -23,7 +23,6 @@

 import fire
 import numpy as np
-import toolz
 import torch
 from albumentations import Compose, HorizontalFlip, Normalize, PadIfNeeded, Resize
 from ignite.contrib.handlers import ConcatScheduler, CosineAnnealingScheduler, LinearCyclicalScheduler
3 changes: 1 addition & 2 deletions cv_lib/cv_lib/event_handlers/__init__.py
@@ -31,8 +31,7 @@ def _create_checkpoint_handler(self):
     def __call__(self, engine, to_save):
         self._checkpoint_handler(engine, to_save)
         if self._snapshot_function():
-            files = glob.glob(os.path.join(self._model_save_location, self._running_model_prefix + "*"))
-            print(files)
+            files = glob.glob(os.path.join(self._model_save_location, self._running_model_prefix + "*"))
             name_postfix = os.path.basename(files[0]).lstrip(self._running_model_prefix)
             copyfile(
                 files[0],
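The change above just drops a stray debug print. For readers skimming the diff, the surviving branch boils down to the sketch below (a paraphrase of the code in the hunk, not code from the repo; the `snapshot_prefix` parameter name is illustrative). One subtlety worth flagging: `str.lstrip` strips a character set, not a literal prefix.

```python
import glob
import os
from shutil import copyfile

def snapshot_latest(model_save_location, running_model_prefix, snapshot_prefix):
    # find checkpoints written for the currently running model
    files = glob.glob(os.path.join(model_save_location, running_model_prefix + "*"))
    # recover the filename postfix; note that lstrip removes any leading
    # characters contained in the prefix string, so this relies on the postfix
    # not starting with a character that also occurs in the prefix
    name_postfix = os.path.basename(files[0]).lstrip(running_model_prefix)
    # copy the running checkpoint out to a permanent snapshot name
    copyfile(files[0], os.path.join(model_save_location, snapshot_prefix + name_postfix))
```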
20 changes: 13 additions & 7 deletions cv_lib/cv_lib/event_handlers/logging_handlers.py
@@ -26,15 +26,21 @@ def log_lr(optimizer, engine):


 @curry
-def log_metrics(log_msg, engine, metrics_dict={"pixacc": "Avg accuracy :", "nll": "Avg loss :"}, fname=None):
+def log_metrics(
+    engine,
+    evaluator,
+    metrics_dict={
+        "nll": "Avg loss :",
+        "pixacc": "Pixelwise Accuracy :",
+        "mca": "Avg Class Accuracy :",
+        "mIoU": "Avg Class IoU :",
+    },
+    stage="undefined",
+):
     logger = logging.getLogger(__name__)
-    metrics = engine.state.metrics
+    metrics = evaluator.state.metrics
     metrics_msg = " ".join([f"{metrics_dict[k]} {metrics[k]:.4f}" for k in metrics_dict])
-    if fname:
-        with open(fname, "w") as fid:
-            output_dict = {metrics_dict[k]: float(metrics[k]) for k in metrics_dict}
-            json.dump(output_dict, fid)
-    logger.info(f"{log_msg} - Epoch {engine.state.epoch} [{engine.state.max_epochs}] " + metrics_msg)
+    logger.info(f"{stage} - Epoch {engine.state.epoch} [{engine.state.max_epochs}] " + metrics_msg)


 @curry
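For context, a minimal sketch of how the reworked `log_metrics` is consumed (this mirrors the notebook cell changed later in this PR; `trainer`, `evaluator`, and `val_loader` are assumed to already exist):

```python
from ignite.engine import Events

# run the evaluator once per epoch, then print its metrics with the new signature;
# `engine` supplies the epoch counters and `evaluator` supplies the metric values
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    evaluator.run(val_loader)
    log_metrics(engine, evaluator, stage="Validation")
```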
@@ -143,10 +143,7 @@
     "from cv_lib.event_handlers import SnapshotHandler, logging_handlers\n",
     "from cv_lib.event_handlers.logging_handlers import Evaluator\n",
     "from cv_lib.event_handlers import tensorboard_handlers\n",
-    "from cv_lib.event_handlers.tensorboard_handlers import (\n",
-    "    create_image_writer,\n",
-    "    create_summary_writer, \n",
-    ")\n",
+    "from cv_lib.event_handlers.tensorboard_handlers import create_summary_writer\n",
     "from cv_lib.segmentation import models\n",
     "from cv_lib.segmentation.dutchf3.engine import (\n",
     "    create_supervised_evaluator,\n",
@@ -537,6 +534,7 @@
     ")\n",
     "\n",
     "if papermill:\n",
+    "    train_set = data.Subset(train_set, range(3))\n",
     "    val_set = data.Subset(val_set, range(3))\n",
     "elif DEMO:\n",
     "    val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))\n",
@@ -571,15 +569,15 @@
    "source": [
     "# if we're running in test mode, just run 2 batches\n",
     "if papermill:\n",
-    "    train_len = config.TRAIN.BATCH_SIZE_PER_GPU*2 \n",
-    "# if we're running in demo mode, just run 10 batches to fine-tune the model\n",
+    "    train_len = 2\n",
+    "# if we're running in demo mode, just run 20 batches to fine-tune the model\n",
     "elif DEMO:\n",
-    "    train_len = config.TRAIN.BATCH_SIZE_PER_GPU*10 \n",
+    "    train_len = 20\n",
     "# if we're not in test or demo modes, run the entire loop\n",
     "else:\n",
     "    train_len = len(train_loader)\n",
     "\n",
-    "snapshot_duration = scheduler_step * train_len if not papermill else 2*len(train_loader)"
+    "snapshot_duration = scheduler_step * train_len if not papermill else train_len"
    ]
   },
   {
@@ -678,10 +676,7 @@
     "# create training engine\n",
     "trainer = create_supervised_trainer(\n",
     "    model, optimizer, criterion, prepare_batch, device=device\n",
-    ")\n",
-    "\n",
-    "# add learning rate scheduler\n",
-    "trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)"
+    ")"
    ]
   },
   {
@@ -710,35 +705,16 @@
     "generate_path(output_dir)\n",
     "\n",
     "# define main summary writer which logs all model summaries\n",
-    "summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))\n",
-    "\n",
-    "# add logging of training output\n",
-    "trainer.add_event_handler(\n",
-    "    Events.ITERATION_COMPLETED,\n",
-    "    logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),\n",
-    ")\n",
-    "\n",
-    "# add logging of learning rate\n",
-    "trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer))\n",
-    "\n",
-    "# log learning rate to tensorboard\n",
-    "trainer.add_event_handler(\n",
-    "    Events.EPOCH_STARTED,\n",
-    "    tensorboard_handlers.log_lr(summary_writer, optimizer, \"epoch\"),\n",
-    ")\n",
-    "\n",
-    "# log training summary to tensorboard as well\n",
-    "trainer.add_event_handler(\n",
-    "    Events.ITERATION_COMPLETED,\n",
-    "    tensorboard_handlers.log_training_output(summary_writer),\n",
-    ")"
+    "summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We also checkpoint models and snapshot them to disk with every training epoch."
+    "Next we need to score the model on validation set as it's training. To do this we need to add helper functions to manipulate data into the required shape just as we've done to prepare each batch for training at the beginning of this notebook.\n",
+    "\n",
+    "We also set up evaluation metrics which we want to record on the training set."
    ]
   },
   {
@@ -747,28 +723,52 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# add model checkpointing\n",
-    "checkpoint_handler = ModelCheckpoint(\n",
-    "    output_dir,\n",
-    "    \"model_f3_nb\",\n",
-    "    save_interval=1,\n",
-    "    n_saved=1,\n",
-    "    create_dir=True,\n",
-    "    require_empty=False,\n",
+    "transform_fn = lambda output_dict: (output_dict[\"y_pred\"].squeeze(), output_dict[\"mask\"].squeeze())\n",
+    "evaluator = create_supervised_evaluator(\n",
+    "    model,\n",
+    "    prepare_batch,\n",
+    "    metrics={\n",
+    "        \"nll\": Loss(criterion, output_transform=transform_fn),\n",
+    "        \"pixacc\": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device),\n",
+    "        \"cacc\": class_accuracy(n_classes, output_transform=transform_fn),\n",
+    "        \"mca\": mean_class_accuracy(n_classes, output_transform=transform_fn),\n",
+    "        \"ciou\": class_iou(n_classes, output_transform=transform_fn),\n",
+    "        \"mIoU\": mean_iou(n_classes, output_transform=transform_fn),\n",
+    "    },\n",
+    "    device=device,\n",
     ")\n",
+    "trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)\n",
+    "\n",
+    "# Logging:\n",
     "trainer.add_event_handler(\n",
-    "    Events.EPOCH_COMPLETED, checkpoint_handler, {config.MODEL.NAME: model}\n",
-    ")"
+    "    Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),\n",
+    ")\n",
+    "trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer))\n",
+    "\n",
+    "# Tensorboard and Logging:\n",
+    "trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer))\n",
+    "trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer))\n",
+    "\n",
+    "# add specific logger which also triggers printed metrics on test set\n",
+    "@trainer.on(Events.EPOCH_COMPLETED)\n",
+    "def log_training_results(engine):\n",
+    "    evaluator.run(train_loader)\n",
+    "    tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage=\"Training\")\n",
+    "    logging_handlers.log_metrics(engine, evaluator, stage=\"Training\")\n",
+    "\n",
+    "# add specific logger which also triggers printed metrics on validation set\n",
+    "@trainer.on(Events.EPOCH_COMPLETED)\n",
+    "def log_validation_results(engine):\n",
+    "    evaluator.run(val_loader)\n",
+    "    tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage=\"Validation\")\n",
+    "    logging_handlers.log_metrics(engine, evaluator, stage=\"Validation\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Next we need to score the model on validation set as it's training. To do this we need to add helper functions to manipulate data into the required shape just as we've done to prepare each batch for training at the beginning of this notebook.\n",
-    "\n",
-    "We also set up evaluation metrics which we want to record on the training set."
+    "We also checkpoint models and snapshot them to disk with every training epoch."
    ]
   },
   {
@@ -777,90 +777,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# helper function for\n",
-    "def _select_pred_and_mask(model_out_dict):\n",
-    "    return (model_out_dict[\"y_pred\"].squeeze(), model_out_dict[\"mask\"].squeeze())\n",
-    "\n",
-    "\n",
-    "def _select_max(pred_tensor):\n",
-    "    return pred_tensor.max(1)[1]\n",
-    "\n",
-    "\n",
-    "def _tensor_to_numpy(pred_tensor):\n",
-    "    return pred_tensor.squeeze().cpu().numpy()\n",
-    "\n",
-    "\n",
-    "def snapshot_function():\n",
-    "    return (trainer.state.iteration % snapshot_duration) == 0\n",
-    "\n",
-    "evaluator = create_supervised_evaluator(\n",
-    "    model,\n",
-    "    prepare_batch,\n",
-    "    metrics={\n",
-    "        \"nll\": Loss(criterion, output_transform=_select_pred_and_mask),\n",
-    "        \"pixacc\": pixelwise_accuracy(\n",
-    "            n_classes, output_transform=_select_pred_and_mask, device=device\n",
-    "        ),\n",
-    "        \"cacc\": class_accuracy(n_classes, output_transform=_select_pred_and_mask),\n",
-    "        \"mca\": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask),\n",
-    "        \"ciou\": class_iou(n_classes, output_transform=_select_pred_and_mask),\n",
-    "        \"mIoU\": mean_iou(n_classes, output_transform=_select_pred_and_mask),\n",
-    "    },\n",
-    "    device=device,\n",
-    ")\n",
-    "\n",
-    "trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader))\n",
-    "\n",
-    "evaluator.add_event_handler(\n",
-    "    Events.EPOCH_COMPLETED,\n",
-    "    logging_handlers.log_metrics(\n",
-    "        \"Validation results\",\n",
-    "        metrics_dict={\n",
-    "            \"nll\": \"Avg loss :\",\n",
-    "            \"pixacc\": \"Pixelwise Accuracy :\",\n",
-    "            \"mca\": \"Avg Class Accuracy :\",\n",
-    "            \"mIoU\": \"Avg Class IoU :\",\n",
-    "        },\n",
-    "    ),\n",
-    ")\n",
-    "\n",
-    "evaluator.add_event_handler(\n",
-    "    Events.EPOCH_COMPLETED,\n",
-    "    tensorboard_handlers.log_metrics(\n",
-    "        summary_writer,\n",
-    "        trainer,\n",
-    "        \"epoch\",\n",
-    "        metrics_dict={\n",
-    "            \"mIoU\": \"Validation/mIoU\",\n",
-    "            \"nll\": \"Validation/Loss\",\n",
-    "            \"mca\": \"Validation/MCA\",\n",
-    "            \"pixacc\": \"Validation/Pixel_Acc\",\n",
-    "        },\n",
-    "    ),\n",
-    ")\n",
-    "\n",
-    "\n",
-    "transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes), _tensor_to_numpy)\n",
-    "\n",
-    "transform_pred = compose(transform_func, _select_max)\n",
-    "\n",
-    "evaluator.add_event_handler(\n",
-    "    Events.EPOCH_COMPLETED,\n",
-    "    create_image_writer(summary_writer, \"Validation/Image\", \"image\"),\n",
-    ")\n",
-    "\n",
-    "evaluator.add_event_handler(\n",
-    "    Events.EPOCH_COMPLETED,\n",
-    "    create_image_writer(\n",
-    "        summary_writer, \"Validation/Mask\", \"mask\", transform_func=transform_func\n",
-    "    ),\n",
+    "# add model checkpointing\n",
+    "checkpoint_handler = ModelCheckpoint(\n",
+    "    output_dir,\n",
+    "    \"model_f3_nb\",\n",
+    "    save_interval=1,\n",
+    "    n_saved=1,\n",
+    "    create_dir=True,\n",
+    "    require_empty=False,\n",
     ")\n",
     "\n",
-    "evaluator.add_event_handler(\n",
-    "    Events.EPOCH_COMPLETED,\n",
-    "    create_image_writer(\n",
-    "        summary_writer, \"Validation/Pred\", \"y_pred\", transform_func=transform_pred\n",
-    "    ),\n",
+    "trainer.add_event_handler(\n",
+    "    Events.EPOCH_COMPLETED, checkpoint_handler, {config.MODEL.NAME: model}\n",
     ")"
    ]
   },
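The rewritten evaluator cell leans on ignite's `output_transform` contract: each metric is handed whatever the evaluator's step function returns and must adapt it into the `(y_pred, y)` pair the metric expects. A self-contained sketch of that mechanism using the same dict convention (toy tensors and shapes are illustrative only; assumes pytorch-ignite is installed):

```python
import torch
from ignite.engine import Engine
from ignite.metrics import Loss

criterion = torch.nn.CrossEntropyLoss()

def eval_step(engine, batch):
    # stand-in for a real forward pass, returning the dict shape the notebook uses
    y_pred = torch.randn(2, 3, 4, 4)       # (batch, n_classes, height, width) logits
    mask = torch.randint(0, 3, (2, 4, 4))  # (batch, height, width) integer labels
    return {"y_pred": y_pred, "mask": mask}

evaluator = Engine(eval_step)

# same idea as the notebook's transform_fn: pick out the two tensors a metric needs
transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze())
Loss(criterion, output_transform=transform_fn).attach(evaluator, "nll")

state = evaluator.run([None])  # a single dummy batch is enough to populate metrics
print(state.metrics["nll"])
```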
8 changes: 6 additions & 2 deletions experiments/interpretation/dutchf3_patch/local/default.py
@@ -11,8 +11,10 @@

 _C = CN()

-_C.OUTPUT_DIR = "output"  # This will be the base directory for all output, such as logs and saved models
-_C.LOG_DIR = ""  # This will be a subdirectory inside OUTPUT_DIR
+# This will be the base directory for all output, such as logs and saved models
+_C.OUTPUT_DIR = "output"
+# This will be a subdirectory inside OUTPUT_DIR
+_C.LOG_DIR = ""
 _C.GPUS = (0,)
 _C.WORKERS = 4
 _C.PRINT_FREQ = 20
@@ -21,6 +23,8 @@
 _C.LOG_CONFIG = "logging.conf"
 _C.SEED = 42
 _C.OPENCV_BORDER_CONSTANT = 0
+# number of batches to use in test/debug mode
+_C.NUM_DEBUG_BATCHES = 1

 # Cudnn related params
 _C.CUDNN = CN()
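The new `NUM_DEBUG_BATCHES` knob caps how much data a test/debug run touches. A hypothetical sketch of how a training script could consume it (the `debug` flag and dataset names here are illustrative, not taken from this diff):

```python
from torch.utils import data

if debug:
    # shrink the dataset to just enough samples for NUM_DEBUG_BATCHES batches,
    # so a debug run still exercises the full training loop end to end
    train_set = data.Subset(
        train_set, range(config.NUM_DEBUG_BATCHES * config.TRAIN.BATCH_SIZE_PER_GPU)
    )
```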