From 3bbe96a0b1277dbecaa4434b4671e67ad128ed45 Mon Sep 17 00:00:00 2001 From: grantmerz Date: Mon, 8 Jul 2024 11:47:19 -0500 Subject: [PATCH] add scripts --- configs/solo/solo_swin_DC2_new.py | 118 ------------------ scripts/README.md | 19 +++ scripts/run_model.py | 199 ++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+), 118 deletions(-) delete mode 100644 configs/solo/solo_swin_DC2_new.py create mode 100644 scripts/README.md create mode 100644 scripts/run_model.py diff --git a/configs/solo/solo_swin_DC2_new.py b/configs/solo/solo_swin_DC2_new.py deleted file mode 100644 index 7a4dd0f..0000000 --- a/configs/solo/solo_swin_DC2_new.py +++ /dev/null @@ -1,118 +0,0 @@ -""" This is a demo "solo config" file for use in solo_test_run_transformers.py. - -This uses template configs cascade_mask_rcnn_swin_b_in21k_50ep and yaml_style_defaults.""" - -from omegaconf import OmegaConf -import numpy as np -# ---------------------------------------------------------------------------- # -# Local variables and metadata -# ---------------------------------------------------------------------------- # -epoch=2 -bs=2 -metadata = OmegaConf.create() -metadata.classes = ["object"] - -numclasses = len(metadata.classes) - -# ---------------------------------------------------------------------------- # -# Standard config (this has always been the LazyConfig/.py-style config) -# ---------------------------------------------------------------------------- # -# Get values from templates -from ..COCO.cascade_mask_rcnn_swin_b_in21k_50ep import dataloader, model, train, lr_multiplier, optimizer -import deepdisc.model.loaders as loaders -from deepdisc.data_format.augment_image import dc2_train_augs, dc2_train_augs_full -from deepdisc.data_format.image_readers import DC2ImageReader - -# Overrides -dataloader.augs = dc2_train_augs -dataloader.train.total_batch_size = bs - -model.proposal_generator.anchor_generator.sizes = [[8], [16], [32], [64], [128]] -model.roi_heads.num_classes = numclasses -model.roi_heads.batch_size_per_image = 512 - -model.roi_heads.num_classes = numclasses -model.roi_heads.batch_size_per_image = 512 - - -# ---------------------------------------------------------------------------- # -#Change for different data sets - -#This is the number of color channels in the images -model.backbone.bottom_up.in_chans = 6 - -#Take the averaged mean and standard deviations of each color channel in the test set -model.pixel_mean = [ - 0.05381286, - 0.04986344, - 0.07526361, - 0.10420945, - 0.14229655, - 0.21245764, -] -model.pixel_std = [ - 2.9318833, - 1.8443471, - 2.581817, - 3.5950038, - 4.5809164, - 7.302009, -] - -# ---------------------------------------------------------------------------- # -model.proposal_generator.nms_thresh = 0.3 - -for box_predictor in model.roi_heads.box_predictors: - box_predictor.test_topk_per_image = 2000 - box_predictor.test_score_thresh = 0.5 - box_predictor.test_nms_thresh = 0.3 - -#The ImageNet1k pretrained weights file -train.init_checkpoint = "/home/shared/hsc/detectron2/projects/ViTDet/model_final_246a82.pkl" - -optimizer.lr = 0.001 -dataloader.test.mapper = loaders.DictMapper -dataloader.train.mapper = loaders.DictMapper - -# ---------------------------------------------------------------------------- # -#Change for different data sets -reader = DC2ImageReader() -dataloader.imagereader = reader -# ---------------------------------------------------------------------------- # -dataloader.epoch=epoch - - -# 
---------------------------------------------------------------------------- #
-# Yaml-style config (was formerly saved as a .yaml file, loaded to cfg_loader)
-# ---------------------------------------------------------------------------- #
-# Get values from template
-from .yacs_style_defaults import MISC, DATALOADER, DATASETS, GLOBAL, INPUT, MODEL, SOLVER, TEST
-
-# Overrides
-SOLVER.IMS_PER_BATCH = bs
-
-DATASETS.TRAIN = "astro_train"
-DATASETS.TEST = "astro_val"
-
-SOLVER.BASE_LR = 0.001
-SOLVER.CLIP_GRADIENTS.ENABLED = True
-# Type of gradient clipping, currently 2 values are supported:
-# - "value": the absolute values of elements of each gradients are clipped
-# - "norm": the norm of the gradient for each parameter is clipped thus
-# affecting all elements in the parameter
-SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "norm"
-# Maximum absolute value used for clipping gradients
-# Floating point number p for L-p norm to be used with the "norm"
-# gradient clipping type; for L-inf, please specify .inf
-SOLVER.CLIP_GRADIENTS.NORM_TYPE = 5.0
-
-
-e1 = epoch * 15
-e2 = epoch * 25
-e3 = epoch * 30
-efinal = epoch * 50
-
-SOLVER.STEPS = [e1,e2,e3] # do not decay learning rate for retraining
-SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
-SOLVER.WARMUP_ITERS = 0
-SOLVER.MAX_ITER = efinal # for DefaultTrainer
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..f1dbf29
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,19 @@
+
+## Training script
+
+This directory contains the script used to run the full training, ```run_model.py```.
+
+Run the script with ```python run_model.py --cfgfile $path_to_config --train-metadata $path_to_train_jsondict --eval-metadata $path_to_eval_dict --num-gpus $ngpu --run-name $name_of_run --output-dir $path_to_output```.
+
+You can test this with the double_test.json/single_test.json files in ```/tests/deepdisc/test_data/dc2/```. You should also download the pre-trained weights [here](https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_swin_b_in21k/f342979038/model_final_246a82.pkl). An example invocation is shown after the options list.
+
+The command line options are explained below:
+
+- cfgfile: The configuration file used to build the model, learning rate optimizer, trainer, and dataloaders. See ```/configs/solo/solo_swin.py``` for an example config.
+- train-metadata: The training data, a list of dicts stored in json format. The dicts should have the "instance detection/segmentation" keys specified in the [detectron2 repo](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html); a sketch of the expected layout is shown after this list.
+- eval-metadata: The same as the training metadata, but for the evaluation set.
+- num-gpus: The number of gpus used to train the model. The total batch size specified in the config must be divisible by the number of gpus.
+- run-name: A string prefix that will be used to name the outputs of the script, such as the model weights and loss curves.
+- output-dir: The directory in which to save the outputs.
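+
+For reference, a hypothetical invocation on the test data might look like the following. The relative paths are placeholders; adjust them to wherever the repo and the downloaded weights live on your system.
+
+```
+python run_model.py \
+    --cfgfile ../configs/solo/solo_swin.py \
+    --train-metadata ../tests/deepdisc/test_data/dc2/single_test.json \
+    --eval-metadata ../tests/deepdisc/test_data/dc2/double_test.json \
+    --num-gpus 1 \
+    --run-name test_run \
+    --output-dir ./
+```
+
+As a rough sketch of the metadata layout (the field values below are illustrative, not taken from the actual test files), each entry in the json list is a detectron2-style dataset dict:
+
+```
+[
+  {
+    "file_name": "image_0.npy",
+    "image_id": 0,
+    "height": 512,
+    "width": 512,
+    "annotations": [
+      {
+        "bbox": [10.0, 20.0, 40.0, 55.0],
+        "bbox_mode": 1,
+        "category_id": 0,
+        "segmentation": [[10.0, 20.0, 40.0, 20.0, 40.0, 55.0, 10.0, 55.0]]
+      }
+    ]
+  }
+]
+```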
+
diff --git a/scripts/run_model.py b/scripts/run_model.py
new file mode 100644
index 0000000..546e8f9
--- /dev/null
+++ b/scripts/run_model.py
@@ -0,0 +1,199 @@
+# ignore ShapelyDeprecationWarning from fvcore
+import warnings
+
+try:
+    from shapely.errors import ShapelyDeprecationWarning
+
+    warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)
+except ImportError:
+    pass
+warnings.filterwarnings("ignore", category=RuntimeWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+# Some basic setup:
+# Setup detectron2 logger
+from detectron2.utils.logger import setup_logger
+setup_logger()
+
+import gc
+import os
+import time
+
+import detectron2.utils.comm as comm
+
+# import some common libraries
+import numpy as np
+import torch
+
+# import some common detectron2 utilities
+from detectron2.config import LazyConfig, get_cfg
+from detectron2.engine import launch
+
+from deepdisc.data_format.augment_image import hsc_test_augs, train_augs
+from deepdisc.data_format.image_readers import DC2ImageReader, HSCImageReader
+from deepdisc.data_format.register_data import register_data_set
+from deepdisc.model.loaders import DictMapper, RedshiftDictMapper, return_test_loader, return_train_loader
+from deepdisc.model.models import RedshiftPDFCasROIHeads, return_lazy_model
+from deepdisc.training.trainers import (
+    return_evallosshook,
+    return_lazy_trainer,
+    return_optimizer,
+    return_savehook,
+    return_schedulerhook,
+)
+from deepdisc.utils.parse_arguments import dtype_from_args, make_training_arg_parser
+
+
+def main(args, freeze):
+    # Hack if you get SSL certificate error
+    import ssl
+    ssl._create_default_https_context = ssl._create_unverified_context
+
+    # Handle args
+    output_dir = args.output_dir
+    run_name = args.run_name
+
+    # Get file locations
+    trainfile = args.train_metadata
+    evalfile = args.eval_metadata
+    cfgfile = args.cfgfile
+
+    # Load the config and promote any yacs-style keys stored under cfg.MISC
+    # to the top level of the config
+    cfg = LazyConfig.load(cfgfile)
+    for key in cfg.get("MISC", dict()).keys():
+        cfg[key] = cfg.MISC[key]
+
+    # Register the data sets
+    astrotrain_metadata = register_data_set(
+        cfg.DATASETS.TRAIN, trainfile, thing_classes=cfg.metadata.classes
+    )
+    astroval_metadata = register_data_set(
+        cfg.DATASETS.TEST, evalfile, thing_classes=cfg.metadata.classes
+    )
+
+    # Set the output directory
+    cfg.OUTPUT_DIR = output_dir
+    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
+
+    # cfg.dataloader.epoch is the number of iterations per epoch, so these
+    # are the iteration counts for 15, 25, 30, and 50 epochs
+    epoch = cfg.dataloader.epoch
+    e1 = epoch * 15
+    e2 = epoch * 25
+    e3 = epoch * 30
+    efinal = epoch * 50
+
+    # How often (in iterations) to compute the evaluation loss
+    val_per = 5
+
+    model = return_lazy_model(cfg, freeze)
+
+    mapper = cfg.dataloader.train.mapper(
+        cfg.dataloader.imagereader, cfg.dataloader.key_mapper, cfg.dataloader.augs
+    ).map_data
+
+    loader = return_train_loader(cfg, mapper)
+    eval_loader = return_test_loader(cfg, mapper)
+
+    cfg.optimizer.params.model = model
+
+    if freeze:
+        cfg.optimizer.lr = 0.001
+        optimizer = return_optimizer(cfg)
+
+        saveHook = return_savehook(run_name)
+        lossHook = return_evallosshook(val_per, model, eval_loader)
+        schedulerHook = return_schedulerhook(optimizer)
+        hookList = [lossHook, schedulerHook, saveHook]
+
+        trainer = return_lazy_trainer(model, loader, optimizer, cfg, hookList)
+        trainer.set_period(epoch // 2)
+        trainer.train(0, e1)
+
+        if comm.is_main_process():
+            np.save(os.path.join(output_dir, run_name + "_losses"), trainer.lossList)
+            np.save(os.path.join(output_dir, run_name + "_val_losses"), trainer.vallossList)
+        return
+
+    else:
+        # Resume from the weights saved by the head-only phase
+        cfg.train.init_checkpoint = os.path.join(output_dir, run_name + ".pth")
+        cfg.SOLVER.BASE_LR = 0.0001
+        cfg.SOLVER.MAX_ITER = efinal  # for DefaultTrainer
+        cfg.SOLVER.STEPS = [e2, e3]
+
+        cfg.optimizer.lr = 0.0001
+        optimizer = return_optimizer(cfg)
+
+        saveHook = return_savehook(run_name)
+        lossHook = return_evallosshook(val_per, model, eval_loader)
+        schedulerHook = return_schedulerhook(optimizer)
+        hookList = [lossHook, schedulerHook, saveHook]
+
+        trainer = return_lazy_trainer(model, loader, optimizer, cfg, hookList)
+        trainer.set_period(epoch // 2)
+        trainer.train(e1, efinal)
+
+        if comm.is_main_process():
+            losses = np.load(os.path.join(output_dir, run_name + "_losses.npy"))
+            losses = np.concatenate((losses, trainer.lossList))
+            np.save(os.path.join(output_dir, run_name + "_losses"), losses)
+        return
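+
+
+# Overview of the two-phase schedule run below (values taken from main() above):
+#   1. freeze=True:  train only the unfrozen head layers for e1 iterations
+#      (15 epochs) at lr=0.001, saving the weights and loss curves under run-name.
+#   2. freeze=False: reload those weights and train all layers up to efinal
+#      iterations (50 epochs) at lr=0.0001, with lr drops at e2 and e3.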
+
+
+if __name__ == "__main__":
+    args = make_training_arg_parser().parse_args()
+    print("Command Line Args:", args)
+
+    print("Training head layers")
+    freeze = True
+    t0 = time.time()
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(
+            args,
+            freeze,
+        ),
+    )
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    ######
+    # After finetuning the head layers, train the whole model
+    ######
+
+    print("Training all layers")
+    freeze = False
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(
+            args,
+            freeze,
+        ),
+    )
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    print(f"Took {time.time()-t0} seconds")
\ No newline at end of file