From 36d0f61c0375f33cc6029ee9b76cd013b7bc0267 Mon Sep 17 00:00:00 2001
From: Garrett Vanhoy
Date: Thu, 23 Mar 2023 16:07:22 -0400
Subject: [PATCH] 44 create and automate test suite (#46)

* Basic test and workflow.
* Workflow installing pytest.
* Wrong command.
* Running linting first.
* Proper unit test.
* Fix test dependency.
* Removed non-working spectrogram code.
* Fixing version mechanism

---------
---
 .github/workflows/pytest.yml | 39 +
 requirements.txt | 2 +
 setup.py | 15 +-
 tests/test_datasets_sig53.py | 8 +
 torchsig/__init__.py | 3 +-
 torchsig/models/__init__.py | 2 -
 .../models/spectrogram_models/__init__.py | 8 -
 .../models/spectrogram_models/detr/LICENSE.md | 201 ---
 .../models/spectrogram_models/detr/README.md | 5 -
 .../spectrogram_models/detr/__init__.py | 1 -
 .../models/spectrogram_models/detr/detr.py | 300 ----
 .../models/spectrogram_models/detr/modules.py | 494 ------
 .../models/spectrogram_models/detr/utils.py | 195 ---
 .../mask2former/LICENSE_Detectron2.md | 202 ---
 .../mask2former/LICENSE_Mask2Former.md | 19 -
 .../spectrogram_models/mask2former/README.md | 5 -
 .../mask2former/__init__.py | 1 -
 .../mask2former/backbone.py | 38 -
 .../mask2former/criterion.py | 583 --------
 .../mask2former/mask2former.py | 258 ----
 .../spectrogram_models/mask2former/modules.py | 146 --
 .../mask2former/ops/functions/__init__.py | 13 -
 .../ops/functions/ms_deform_attn_func.py | 72 -
 .../mask2former/ops/make.sh | 13 -
 .../mask2former/ops/modules/__init__.py | 12 -
 .../mask2former/ops/modules/ms_deform_attn.py | 125 --
 .../mask2former/ops/setup.py | 78 -
 .../ops/src/cpu/ms_deform_attn_cpu.cpp | 46 -
 .../ops/src/cpu/ms_deform_attn_cpu.h | 38 -
 .../ops/src/cuda/ms_deform_attn_cuda.cu | 158 --
 .../ops/src/cuda/ms_deform_attn_cuda.h | 35 -
 .../ops/src/cuda/ms_deform_im2col_cuda.cuh | 1332 -----------------
 .../mask2former/ops/src/ms_deform_attn.h | 67 -
 .../mask2former/ops/src/vision.cpp | 21 -
 .../mask2former/ops/test.py | 92 --
 .../mask2former/pixel_decoder.py | 724 ---------
 .../mask2former/predictor.py | 395 -----
 .../spectrogram_models/mask2former/utils.py | 298 ----
 .../spectrogram_models/pspnet/LICENSE.md | 21 -
 .../spectrogram_models/pspnet/README.md | 5 -
 .../spectrogram_models/pspnet/__init__.py | 1 -
 .../spectrogram_models/pspnet/modules.py | 54 -
 .../spectrogram_models/pspnet/pspnet.py | 281 ----
 .../models/spectrogram_models/pspnet/utils.py | 140 --
 .../spectrogram_models/yolov5/LICENSE.md | 674 ---------
 .../spectrogram_models/yolov5/README.md | 5 -
 .../spectrogram_models/yolov5/__init__.py | 1 -
 .../spectrogram_models/yolov5/mean_ap.py | 802 ----------
 .../spectrogram_models/yolov5/modules.py | 804 ----------
 .../models/spectrogram_models/yolov5/utils.py | 343 -----
 .../spectrogram_models/yolov5/yolov5.py | 247 ---
 .../spectrogram_models/yolov5/yolov5f.yaml | 50 -
 .../spectrogram_models/yolov5/yolov5l.yaml | 48 -
 .../spectrogram_models/yolov5/yolov5m.yaml | 48 -
 .../spectrogram_models/yolov5/yolov5n.yaml | 48 -
 .../spectrogram_models/yolov5/yolov5p.yaml | 50 -
 .../spectrogram_models/yolov5/yolov5s.yaml | 48 -
 .../spectrogram_models/yolov5/yolov5x.yaml | 48 -
 .../transforms/spectrogram_transforms/spec.py | 846 ++++++-----
 .../target_transforms/target_transforms.py | 1156 ++++++------
 torchsig/utils/dataset.py | 65 +-
 torchsig/utils/visualize.py | 460 ++++--
 torchsig/version.py | 1 -
 63 files changed, 1603 insertions(+), 10687 deletions(-)
 create mode 100644 .github/workflows/pytest.yml
 create mode 100644 tests/test_datasets_sig53.py
 delete mode 100644 torchsig/models/spectrogram_models/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/detr/LICENSE.md
 delete mode 100644 torchsig/models/spectrogram_models/detr/README.md
 delete mode 100644 torchsig/models/spectrogram_models/detr/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/detr/detr.py
 delete mode 100644 torchsig/models/spectrogram_models/detr/modules.py
 delete mode 100644 torchsig/models/spectrogram_models/detr/utils.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/LICENSE_Detectron2.md
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/LICENSE_Mask2Former.md
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/README.md
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/backbone.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/criterion.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/mask2former.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/modules.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/functions/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/functions/ms_deform_attn_func.py
 delete mode 100755 torchsig/models/spectrogram_models/mask2former/ops/make.sh
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/modules/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/modules/ms_deform_attn.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/setup.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.cpp
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.h
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.cu
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.h
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_im2col_cuda.cuh
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/ms_deform_attn.h
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/src/vision.cpp
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/ops/test.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/pixel_decoder.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/predictor.py
 delete mode 100644 torchsig/models/spectrogram_models/mask2former/utils.py
 delete mode 100644 torchsig/models/spectrogram_models/pspnet/LICENSE.md
 delete mode 100644 torchsig/models/spectrogram_models/pspnet/README.md
 delete mode 100644 torchsig/models/spectrogram_models/pspnet/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/pspnet/modules.py
 delete mode 100644 torchsig/models/spectrogram_models/pspnet/pspnet.py
 delete mode 100644 torchsig/models/spectrogram_models/pspnet/utils.py
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/LICENSE.md
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/README.md
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/__init__.py
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/mean_ap.py
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/modules.py
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/utils.py
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5.py
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5f.yaml
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5l.yaml
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5m.yaml
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5n.yaml
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5p.yaml
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5s.yaml
 delete mode 100644 torchsig/models/spectrogram_models/yolov5/yolov5x.yaml
 delete mode 100644 torchsig/version.py

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
new file mode 100644
index 0000000..0b75fe9
--- /dev/null
+++ b/.github/workflows/pytest.yml
@@ -0,0 +1,39 @@
+name: Test
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: actions/cache@v3
+        with:
+          path: ${{ env.pythonLocation }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip --upgrade-strategy eager
+          pip install -r requirements.txt
+      - name: Build package
+        run: |
+          python -m pip install .
+      - name: Lint with flake8
+        run: |
+          pip3 install flake8
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Test with pytest
+        run: |
+          pip install pytest
+          pytest
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 3809445..9c35122 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,5 @@ timm==0.5.4
 segmentation_models_pytorch
 pytorch_lightning
 sympy
+numba
+torchmetrics
diff --git a/setup.py b/setup.py
index faa319b..9fe7f39 100644
--- a/setup.py
+++ b/setup.py
@@ -1,19 +1,16 @@
-import os
-import setuptools
 from distutils.core import setup
+import setuptools
 
 with open("README.md") as f:
     long_description = f.read()
 
-exec(open('torchsig/version.py').read())
-
 setup(
-    name='torchsig',
-    version=__version__,
-    description='Signal Processing Machine Learning Toolkit',
+    name="torchsig",
+    version="0.1.0",
+    description="Signal Processing Machine Learning Toolkit",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    author='TorchSig Team',
-    url='https://github.com/torchdsp/torchsig',
+    author="TorchSig Team",
+    url="https://github.com/torchdsp/torchsig",
     packages=setuptools.find_packages(),
 )
diff --git a/tests/test_datasets_sig53.py b/tests/test_datasets_sig53.py
new file mode 100644
index 0000000..dca69a8
--- /dev/null
+++ b/tests/test_datasets_sig53.py
@@ -0,0 +1,8 @@
+from torchsig.datasets.sig53 import Sig53
+from unittest import TestCase
+
+
+class GenerateSig53(TestCase):
+    def test_can_generate_sig53_clean_train(self):
+        x = 2 + 2
+        self.assertEqual(x, 4)
diff --git a/torchsig/__init__.py b/torchsig/__init__.py
index ea58c63..b3025f3 100644
--- a/torchsig/__init__.py
+++ b/torchsig/__init__.py
@@ -2,4 +2,5 @@
 from torchsig import datasets
 from torchsig import utils
 from torchsig import models
-from .version import __version__ \ No newline at end of file + +__version__ = "0.1.0" diff --git a/torchsig/models/__init__.py b/torchsig/models/__init__.py index 9f92126..57d1d75 100644 --- a/torchsig/models/__init__.py +++ b/torchsig/models/__init__.py @@ -1,4 +1,2 @@ from . import iq_models -from . import spectrogram_models from torchsig.models.iq_models import * -from torchsig.models.spectrogram_models import * diff --git a/torchsig/models/spectrogram_models/__init__.py b/torchsig/models/spectrogram_models/__init__.py deleted file mode 100644 index 91b7b61..0000000 --- a/torchsig/models/spectrogram_models/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .yolov5 import * -from .detr import * -from .pspnet import * -from .mask2former import * -from torchsig.models.spectrogram_models.yolov5 import * -from torchsig.models.spectrogram_models.detr import * -from torchsig.models.spectrogram_models.pspnet import * -from torchsig.models.spectrogram_models.mask2former import * \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/detr/LICENSE.md b/torchsig/models/spectrogram_models/detr/LICENSE.md deleted file mode 100644 index cc14143..0000000 --- a/torchsig/models/spectrogram_models/detr/LICENSE.md +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2020 - present, Facebook, Inc - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/detr/README.md b/torchsig/models/spectrogram_models/detr/README.md deleted file mode 100644 index 1b1c70b..0000000 --- a/torchsig/models/spectrogram_models/detr/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# DETR - -The DETR code contained here has been cloned, modified, and supplemented from its original [detr github](https://github.com/facebookresearch/detr). - -DETR is licensed under an Apache 2.0 license. This license for DETR is contained within this directory. 
diff --git a/torchsig/models/spectrogram_models/detr/__init__.py b/torchsig/models/spectrogram_models/detr/__init__.py deleted file mode 100644 index bd38e28..0000000 --- a/torchsig/models/spectrogram_models/detr/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .detr import * \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/detr/detr.py b/torchsig/models/spectrogram_models/detr/detr.py deleted file mode 100644 index f504170..0000000 --- a/torchsig/models/spectrogram_models/detr/detr.py +++ /dev/null @@ -1,300 +0,0 @@ -import timm -import gdown -import torch -import os.path -import numpy as np -from torch import nn - -from .modules import * -from .utils import * - - -__all__ = [ - "detr_b0_nano", "detr_b2_nano", "detr_b4_nano", - "detr_b0_nano_mod_family", "detr_b2_nano_mod_family", "detr_b4_nano_mod_family", -] - -model_urls = { - "detr_b0_nano": "1t6V3M5hJC8C-RSwPtgKGG89u5doibs46", - "detr_b2_nano": "1voDx7e0pBe_lGa_1sUYG8gyzOqz8nxmw", - "detr_b4_nano": "1RA7yGvpKiIXHXl_o89Zn6R2dVVTgKsWO", - "detr_b0_nano_mod_family": "1w42OxyAFf7CTJ5Yw8OU-kAZQZCpkNyaz", - "detr_b2_nano_mod_family": "1Wd8QD5Eq2mbEz3hkMlAQFxWZcxZChLma", - "detr_b4_nano_mod_family": "1ykrztgBc6c9knk1F2OirSUE_W3YbsTdB", -} - - -def detr_b0_nano( - pretrained: bool = False, - path: str = "detr_b0_nano.pt", - num_classes: int = 1, - drop_rate_backbone: float = 0.2, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, -): - """Constructs a DETR architecture with an EfficientNet-B0 backbone and an XCiT-Nano transformer. - DETR from `"End-to-End Object Detection with Transformers" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - XCiT from `"XCiT: Cross-Covariance Image Transformers" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - drop_path_rate_backbone (float): Backbone drop path rate for training - drop_rate_backbone (float): Backbone dropout rate for training - drop_path_rate_transformer (float): Transformer drop path rate for training - - """ - # Create DETR-B0-Nano - mdl = create_detr( - backbone='efficientnet_b0', - transformer='xcit-nano', - num_classes=1, - num_objects=50, - hidden_dim=256, - drop_rate_backbone=drop_rate_backbone, - drop_path_rate_backbone=drop_path_rate_backbone, - drop_path_rate_transformer=drop_path_rate_transformer, - ds_rate_transformer=2, - ds_method_transformer='chunker', - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['detr_b0_nano'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.linear_class = nn.Linear(mdl.linear_class.in_features, num_classes) - return mdl - - -def detr_b2_nano( - pretrained: bool = False, - path: str = "detr_b2_nano.pt", - num_classes: int = 1, - drop_rate_backbone: float = 0.3, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, -): - """Constructs a DETR architecture with an EfficientNet-B2 backbone and an XCiT-Nano transformer. - DETR from `"End-to-End Object Detection with Transformers" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
- XCiT from `"XCiT: Cross-Covariance Image Transformers" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - drop_path_rate_backbone (float): Backbone drop path rate for training - drop_rate_backbone (float): Backbone dropout rate for training - drop_path_rate_transformer (float): Transformer drop path rate for training - - """ - # Create DETR-B2-Nano - mdl = create_detr( - backbone='efficientnet_b2', - transformer='xcit-nano', - num_classes=1, - num_objects=50, - hidden_dim=256, - drop_rate_backbone=drop_rate_backbone, - drop_path_rate_backbone=drop_path_rate_backbone, - drop_path_rate_transformer=drop_path_rate_transformer, - ds_rate_transformer=2, - ds_method_transformer='chunker', - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['detr_b2_nano'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.linear_class = nn.Linear(mdl.linear_class.in_features, num_classes) - return mdl - - -def detr_b4_nano( - pretrained: bool = False, - path: str = "detr_b4_nano.pt", - num_classes: int = 1, - drop_rate_backbone: float = 0.4, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, -): - """Constructs a DETR architecture with an EfficientNet-B4 backbone and an XCiT-Nano transformer. - DETR from `"End-to-End Object Detection with Transformers" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - XCiT from `"XCiT: Cross-Covariance Image Transformers" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - drop_path_rate_backbone (float): Backbone drop path rate for training - drop_rate_backbone (float): Backbone dropout rate for training - drop_path_rate_transformer (float): Transformer drop path rate for training - - """ - # Create DETR-B4-Nano - mdl = create_detr( - backbone='efficientnet_b4', - transformer='xcit-nano', - num_classes=1, - num_objects=50, - hidden_dim=256, - drop_rate_backbone=drop_rate_backbone, - drop_path_rate_backbone=drop_path_rate_backbone, - drop_path_rate_transformer=drop_path_rate_transformer, - ds_rate_transformer=2, - ds_method_transformer='chunker', - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['detr_b4_nano'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.linear_class = nn.Linear(mdl.linear_class.in_features, num_classes) - return mdl - - -def detr_b0_nano_mod_family( - pretrained: bool = False, - path: str = "detr_b0_nano_mod_family.pt", - num_classes: int = 6, - drop_rate_backbone: float = 0.2, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, -): - """Constructs a DETR architecture with an EfficientNet-B0 backbone and an XCiT-Nano transformer. - DETR from `"End-to-End Object Detection with Transformers" `_. 
- EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - XCiT from `"XCiT: Cross-Covariance Image Transformers" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 6, final layer will not be loaded from checkpoint - drop_path_rate_backbone (float): Backbone drop path rate for training - drop_rate_backbone (float): Backbone dropout rate for training - drop_path_rate_transformer (float): Transformer drop path rate for training - - """ - # Create DETR-B0-Nano - mdl = create_detr( - backbone='efficientnet_b0', - transformer='xcit-nano', - num_classes=6, - num_objects=50, - hidden_dim=256, - drop_rate_backbone=drop_rate_backbone, - drop_path_rate_backbone=drop_path_rate_backbone, - drop_path_rate_transformer=drop_path_rate_transformer, - ds_rate_transformer=2, - ds_method_transformer='chunker', - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['detr_b0_nano_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.linear_class = nn.Linear(mdl.linear_class.in_features, num_classes) - return mdl - - -def detr_b2_nano_mod_family( - pretrained: bool = False, - path: str = "detr_b2_nano_mod_family.pt", - num_classes: int = 1, - drop_rate_backbone: float = 0.3, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, -): - """Constructs a DETR architecture with an EfficientNet-B2 backbone and an XCiT-Nano transformer. - DETR from `"End-to-End Object Detection with Transformers" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - XCiT from `"XCiT: Cross-Covariance Image Transformers" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 6, final layer will not be loaded from checkpoint - drop_path_rate_backbone (float): Backbone drop path rate for training - drop_rate_backbone (float): Backbone dropout rate for training - drop_path_rate_transformer (float): Transformer drop path rate for training - - """ - # Create DETR-B2-Nano - mdl = create_detr( - backbone='efficientnet_b2', - transformer='xcit-nano', - num_classes=6, - num_objects=50, - hidden_dim=256, - drop_rate_backbone=drop_rate_backbone, - drop_path_rate_backbone=drop_path_rate_backbone, - drop_path_rate_transformer=drop_path_rate_transformer, - ds_rate_transformer=2, - ds_method_transformer='chunker', - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['detr_b2_nano_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.linear_class = nn.Linear(mdl.linear_class.in_features, num_classes) - return mdl - - -def detr_b4_nano_mod_family( - pretrained: bool = False, - path: str = "detr_b4_nano_mod_family.pt", - num_classes: int = 6, - drop_rate_backbone: float = 0.4, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, -): - """Constructs a DETR architecture with an EfficientNet-B4 backbone and an XCiT-Nano transformer. 
- DETR from `"End-to-End Object Detection with Transformers" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - XCiT from `"XCiT: Cross-Covariance Image Transformers" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 6, final layer will not be loaded from checkpoint - drop_path_rate_backbone (float): Backbone drop path rate for training - drop_rate_backbone (float): Backbone dropout rate for training - drop_path_rate_transformer (float): Transformer drop path rate for training - - """ - # Create DETR-B4-Nano - mdl = create_detr( - backbone='efficientnet_b4', - transformer='xcit-nano', - num_classes=6, - num_objects=50, - hidden_dim=256, - drop_rate_backbone=drop_rate_backbone, - drop_path_rate_backbone=drop_path_rate_backbone, - drop_path_rate_transformer=drop_path_rate_transformer, - ds_rate_transformer=2, - ds_method_transformer='chunker', - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['detr_b0_nano_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.linear_class = nn.Linear(mdl.linear_class.in_features, num_classes) - return mdl diff --git a/torchsig/models/spectrogram_models/detr/modules.py b/torchsig/models/spectrogram_models/detr/modules.py deleted file mode 100644 index 211d103..0000000 --- a/torchsig/models/spectrogram_models/detr/modules.py +++ /dev/null @@ -1,494 +0,0 @@ -import timm -import torch -from torch import nn -from typing import List -from torch.nn import functional as F -from scipy.optimize import linear_sum_assignment - -from .utils import xcit_name_to_timm_name -from .utils import drop_classifier, find_output_features -from .utils import box_cxcywh_to_xyxy, generalized_box_iou -from .utils import is_dist_avail_and_initialized, get_world_size, accuracy - - -class ConvDownSampler(torch.nn.Module): - def __init__(self, in_chans, embed_dim, ds_rate=16): - super().__init__() - ds_rate //= 2 - chan = embed_dim // ds_rate - blocks = [ - torch.nn.Conv2d(in_chans, chan, (5,5), 2, 2), - torch.nn.BatchNorm2d(chan), - torch.nn.SiLU() - ] - - while ds_rate > 1: - blocks += [ - torch.nn.Conv2d(chan, 2 * chan, (5,5), 2, 2), - torch.nn.BatchNorm2d(2 * chan), - torch.nn.SiLU(), - ] - ds_rate //= 2 - chan = 2 * chan - - blocks += [ - torch.nn.Conv2d( - chan, - embed_dim, - (1,1), - ) - ] - self.blocks = torch.nn.Sequential(*blocks) - - def forward(self, X): - return self.blocks(X) - - -class Chunker(torch.nn.Module): - def __init__(self, in_chans, embed_dim, ds_rate=16): - super().__init__() - self.embed = torch.nn.Conv2d(in_chans, embed_dim // ds_rate, (7,7), padding=3) - self.project = torch.nn.Conv2d((embed_dim // ds_rate) * ds_rate, embed_dim, (1,1)) - self.ds_rate = ds_rate - - def forward(self, X): - X = self.embed(X) - X = torch.cat( - [ - torch.cat(torch.split(x_i, 1, -1), 1) - for x_i in torch.split(X, self.ds_rate, -1) - ], - -1, - ) - X = self.project(X) - - return X - - -class XCiT(torch.nn.Module): - def __init__(self, backbone, in_chans=2, num_objects=50, ds_rate=2, ds_method="downsample"): - super().__init__() - self.backbone = backbone - self.num_objects = num_objects - W = backbone.num_features - self.grouper = torch.nn.Conv1d(W, backbone.num_classes, 1) - if ds_method == 
"downsample": - self.backbone.patch_embed = ConvDownSampler(in_chans, W, ds_rate) - else: - self.backbone.patch_embed = Chunker(in_chans, W, ds_rate) - - def forward(self, x): - mdl = self.backbone - B = x.shape[0] - x = self.backbone.patch_embed(x) - - Hp, Wp = x.shape[-2], x.shape[-1] - pos_encoding = ( - mdl.pos_embed(B, Hp, Wp).reshape(B, -1, Hp*Wp).permute(0, 2, 1).half() - ) - x = x.reshape(B, -1, Hp*Wp).permute(0, 2,1) + pos_encoding - for blk in mdl.blocks: - x = blk(x, Hp, Wp) - cls_tokens = mdl.cls_token.expand(B, -1, -1) - x = torch.cat((cls_tokens, x), dim=1) - for blk in mdl.cls_attn_blocks: - x = blk(x) - x = mdl.norm(x) - x = self.grouper(x.transpose(1, 2)[:, :, :self.num_objects]) - x = x.squeeze() - if x.dim() == 2: - x = x.unsqueeze(0) - x = x.transpose(1,2) - return x - - -class MLP(torch.nn.Module): - """Very simple multi-layer perceptron (also called FFN) from DETR repo - - """ - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = torch.nn.ModuleList( - torch.nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) - ) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -class DETRModel(torch.nn.Module): - def __init__( - self, - backbone: torch.nn.Module, - transformer: torch.nn.Module, - num_classes: int = 53, - num_objects: int = 50, - hidden_dim: int = 256, - ): - super().__init__() - # Convolutional backbone - self.backbone = backbone - - # Conversion layer - self.conv = torch.nn.Conv2d( - in_channels=find_output_features(self.backbone), - out_channels=hidden_dim, - kernel_size=1, - ) - - # Transformer - self.transformer = transformer - - # Prediction heads, one extra class for predicting non-empty slots - self.linear_class = torch.nn.Linear(hidden_dim, num_classes + 1) - self.linear_bbox = MLP(hidden_dim, hidden_dim, 4, 3) - - def forward(self, x): - # Propagate inputs through backbone - x = self.backbone(x) - - # Convert from 2048 to 256 feature planes for the transformer - h = self.conv(x) - - # Propagate through the transformer - h = self.transformer(h) - - # Project transformer outputs to class labels and bounding boxes - return { - 'pred_logits': self.linear_class(h), - 'pred_boxes': self.linear_bbox(h).sigmoid() - } - - -class SetCriterion(nn.Module): - """ This class computes the loss for DETR. - The process happens in two steps: - 1) we compute hungarian assignment between ground truth boxes and the outputs of the model - 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) - """ - def __init__( - self, - num_classes: int = 1, - class_loss_coef: float = 1.0, - bbox_loss_coef: float = 5.0, - giou_loss_coef: float = 2.0, - eos_coef: float = 0.1, - losses: List[str] = ['labels', 'boxes', 'cardinality'], - ): - """ Create the criterion. - Parameters: - num_classes: number of object categories, omitting the special no-object category - matcher: module able to compute a matching between targets and proposals - weight_dict: dict containing as key the names of the losses and as values their relative weight. - eos_coef: relative classification weight applied to the no-object category - losses: list of all the losses to be applied. See get_loss for list of available losses. 
- """ - super().__init__() - self.num_classes = num_classes - self.weight_dict = { - 'loss_ce': class_loss_coef, - 'loss_bbox': bbox_loss_coef, - 'loss_giou': giou_loss_coef, - } - self.matcher = HungarianMatcher( - cost_class=self.weight_dict['loss_ce'], - cost_bbox=self.weight_dict['loss_bbox'], - cost_giou=self.weight_dict['loss_giou'], - ) - self.eos_coef = eos_coef - self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer('empty_weight', empty_weight) - - def loss_labels(self, outputs, targets, indices, num_boxes, log=True): - """Classification loss (NLL) - targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] - """ - assert 'pred_logits' in outputs - src_logits = outputs['pred_logits'] - - idx = self._get_src_permutation_idx(indices) - target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - src_logits.shape[:2], - self.num_classes, - dtype=torch.int64, - device=src_logits.device - ) - target_classes[idx] = target_classes_o - - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) - losses = {'loss_ce': loss_ce} - - if log: - # TODO this should probably be a separate loss, not hacked in this one here - losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] - return losses - - @torch.no_grad() - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients - """ - pred_logits = outputs['pred_logits'] - device = pred_logits.device - tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) - card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) - losses = {'cardinality_error': card_err} - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss - targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] - The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - assert 'pred_boxes' in outputs - idx = self._get_src_permutation_idx(indices) - src_boxes = outputs['pred_boxes'][idx] - target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') - - losses = {} - losses['loss_bbox'] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag(generalized_box_iou( - box_cxcywh_to_xyxy(src_boxes), - box_cxcywh_to_xyxy(target_boxes))) - losses['loss_giou'] = loss_giou.sum() / num_boxes - return losses - - def loss_masks(self, outputs, targets, indices, num_boxes): - """Compute the losses related to the masks: the focal loss and the dice loss. 
- targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] - """ - assert "pred_masks" in outputs - - src_idx = self._get_src_permutation_idx(indices) - tgt_idx = self._get_tgt_permutation_idx(indices) - src_masks = outputs["pred_masks"] - src_masks = src_masks[src_idx] - masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(src_masks) - target_masks = target_masks[tgt_idx] - - # upsample predictions to the target size - src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], - mode="bilinear", align_corners=False) - src_masks = src_masks[:, 0].flatten(1) - - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(src_masks.shape) - losses = { - "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), - "loss_dice": dice_loss(src_masks, target_masks, num_boxes), - } - return losses - - def _get_src_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = torch.cat([src for (src, _) in indices]) - return batch_idx, src_idx - - def _get_tgt_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) - tgt_idx = torch.cat([tgt for (_, tgt) in indices]) - return batch_idx, tgt_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): - loss_map = { - 'labels': self.loss_labels, - 'cardinality': self.loss_cardinality, - 'boxes': self.loss_boxes, - 'masks': self.loss_masks - } - assert loss in loss_map, f'do you really want to compute {loss} loss?' - return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) - - def forward(self, outputs, targets): - """ This performs the loss computation. - Parameters: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc - """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes accross all nodes, for normalization purposes - num_boxes = sum(len(t["labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - if is_dist_avail_and_initialized(): - torch.distributed.all_reduce(num_boxes) - num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if 'aux_outputs' in outputs: - for i, aux_outputs in enumerate(outputs['aux_outputs']): - indices = self.matcher(aux_outputs, targets) - for loss in self.losses: - if loss == 'masks': - # Intermediate masks losses are too costly to compute, we ignore them. 
- continue - kwargs = {} - if loss == 'labels': - # Logging is enabled only for the last layer - kwargs = {'log': False} - l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) - l_dict = {k + f'_{i}': v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - -class HungarianMatcher(nn.Module): - """This class computes an assignment between the targets and the predictions of the network - For efficiency reasons, the targets don't include the no_object. Because of this, in general, - there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, - while the others are un-matched (and thus treated as non-objects). - """ - def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1): - """Creates the matcher - Params: - cost_class: This is the relative weight of the classification error in the matching cost - cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost - cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost - """ - super().__init__() - self.cost_class = cost_class - self.cost_bbox = cost_bbox - self.cost_giou = cost_giou - assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" - - @torch.no_grad() - def forward(self, outputs, targets): - """ Performs the matching - Params: - outputs: This is a dict that contains at least these entries: - "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates - targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: - "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth - objects in the target) containing the class labels - "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - bs, num_queries = outputs["pred_logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - tgt_ids = torch.cat([v["labels"] for v in targets]) - tgt_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. 
- cost_class = -out_prob[:, tgt_ids] - - # Compute the L1 cost between boxes - cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) - - # Compute the giou cost betwen boxes - cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) - - # Final cost matrix - C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou - C = C.view(bs, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -def create_detr( - backbone: str = 'efficientnet_b0', - transformer: str = 'xcit-nano', - num_classes: int = 53, - num_objects: int = 50, - hidden_dim: int = 256, - drop_rate_backbone: float = 0.2, - drop_path_rate_backbone: float = 0.2, - drop_path_rate_transformer: float = 0.1, - ds_rate_transformer: int = 2, - ds_method_transformer: str = 'chunker', -) -> torch.nn.Module: - """ - Function used to build a DETR network - - Args: - TODO - - Returns: - torch.nn.Module - - """ - # build backbone - if 'eff' in backbone: - backbone = timm.create_model( - model_name=backbone, - in_chans=2, - drop_rate=drop_rate_backbone, - drop_path_rate=drop_path_rate_backbone, - ) - backbone = drop_classifier(backbone) - else: - raise NotImplemented('Only EfficientNet backbones are supported right now.') - - # Build transformer - if 'xcit' in transformer: - # map short name to timm name - model_name = xcit_name_to_timm_name(transformer) - - # build transformer - transformer = XCiT( - backbone=timm.create_model( - model_name=model_name, - drop_path_rate=drop_path_rate_transformer, - in_chans=hidden_dim, - num_classes=hidden_dim, - ), - in_chans=hidden_dim, - num_objects=num_objects, - ds_rate=ds_rate_transformer, - ds_method=ds_method_transformer, - ) - - else: - raise NotImplemented('Only XCiT transformers are supported right now.') - - # Build full DETR network - network = DETRModel( - backbone, - transformer, - num_classes=num_classes, - num_objects=num_objects, - hidden_dim=hidden_dim, - ) - - return network \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/detr/utils.py b/torchsig/models/spectrogram_models/detr/utils.py deleted file mode 100644 index 2f95d33..0000000 --- a/torchsig/models/spectrogram_models/detr/utils.py +++ /dev/null @@ -1,195 +0,0 @@ -import torch -import numpy as np -from torch import nn -import torch.distributed as dist -from typing import List, Optional -from torchvision.ops.boxes import box_area - - -def drop_classifier(parent): - return torch.nn.Sequential(*list(parent.children())[:-2]) - - -def find_output_features(parent, num_features=0): - for n, m in parent.named_children(): - if type(m) is torch.nn.Conv2d: - num_features = m.out_channels - else: - num_features = find_output_features(m, num_features) - return num_features - - -def xcit_name_to_timm_name(input_name: str) -> str: - if 'nano' in input_name: - model_name = 'xcit_nano_12_p16_224' - elif 'tiny' in input_name: - if '24' in input_name: - model_name = 'xcit_tiny_24_p16_224' - else: - model_name = 'xcit_tiny_12_p16_224' - elif 'small' in input_name: - model_name = 'xcit_small_24_p8_224' - elif 'medium' in input_name: - model_name = 'xcit_medium_24_p8_224' - elif 'large' in input_name: - model_name = 'xcit_large_24_p8_224' - else: - raise NotImplemented('Input transformer not supported.') - - return model_name - - -def 
is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -@torch.no_grad() -def accuracy(output, target, topk=(1,)): - """Computes the precision@k for the specified values of k""" - if target.numel() == 0: - return [torch.zeros([], device=output.device)] - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), - (x_c + 0.5 * w), (y_c + 0.5 * h)] - return torch.stack(b, dim=-1) - - -# modified from torchvision to also return the union -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - wh = (rb - lt).clamp(min=0) # [N,M,2] - inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/ - The boxes should be in [x0, y0, x1, y1] format - Returns a [N, M] pairwise matrix, where N = len(boxes1) - and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() - iou, union = box_iou(boxes1, boxes2) - - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] - - return iou - (area - union) / area - - -def format_preds(preds): - map_preds = [] - for (i, (det_logits, det_boxes)) in enumerate(zip(preds['pred_logits'], preds['pred_boxes'])): - boxes = [] - scores = [] - labels = [] - - # Convert DETR output format to expected bboxes - num_objs = 0 - pred = {} - pred['pred_logits'] = det_logits - pred['pred_boxes'] = det_boxes - - det_list = [] - for obj_idx in range(pred['pred_logits'].shape[0]): - probs = pred['pred_logits'][obj_idx].softmax(-1) - max_prob = probs.max().cpu().detach().numpy() - max_class = probs.argmax().cpu().detach().numpy() - if max_class != (pred['pred_logits'].shape[1] - 1) and max_prob >= 0.5: - center_time = pred['pred_boxes'][obj_idx][0] - center_freq = pred['pred_boxes'][obj_idx][1] - duration = pred['pred_boxes'][obj_idx][2] - bandwidth = pred['pred_boxes'][obj_idx][3] - - # Save to box, score, label lists - x1 = max(0,(center_time - duration / 2) * 512) - y1 = max(0,(center_freq - bandwidth / 2) * 512) - x2 = min(512,(center_time + duration / 2) * 512) - y2 = min(512,(center_freq + bandwidth / 2) * 512) - - boxes.append([x1, y1, x2, y2]) - scores.extend([float(max_prob)]) - labels.extend([int(max_class)]) - - curr_pred = dict( - boxes=torch.tensor(boxes).to("cuda"), - scores=torch.tensor(scores).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - - map_preds.append(curr_pred) - - return map_preds - - -def format_targets(labels): - map_targets = [] - - for i, label in enumerate(labels): - boxes = [] - scores = [] - labels = [] - - for 
label_obj_idx in range(len(label['labels'])): - center_time = label["boxes"][label_obj_idx][0] - center_freq = label["boxes"][label_obj_idx][1] - duration = label["boxes"][label_obj_idx][2] - bandwidth = label["boxes"][label_obj_idx][3] - class_idx = label["labels"][label_obj_idx] - - x1 = (center_time - duration / 2) * 512 - y1 = (center_freq - bandwidth / 2) * 512 - x2 = (center_time + duration / 2) * 512 - y2 = (center_freq + bandwidth / 2) * 512 - - boxes.append([x1, y1, x2, y2]) - labels.extend([int(class_idx)]) - - curr_target = dict( - boxes=torch.tensor(boxes).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - map_targets.append(curr_target) - - return map_targets diff --git a/torchsig/models/spectrogram_models/mask2former/LICENSE_Detectron2.md b/torchsig/models/spectrogram_models/mask2former/LICENSE_Detectron2.md deleted file mode 100644 index ea36abb..0000000 --- a/torchsig/models/spectrogram_models/mask2former/LICENSE_Detectron2.md +++ /dev/null @@ -1,202 +0,0 @@ -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - -"License" shall mean the terms and conditions for use, reproduction, -and distribution as defined by Sections 1 through 9 of this document. - -"Licensor" shall mean the copyright owner or entity authorized by -the copyright owner that is granting the License. - -"Legal Entity" shall mean the union of the acting entity and all -other entities that control, are controlled by, or are under common -control with that entity. For the purposes of this definition, -"control" means (i) the power, direct or indirect, to cause the -direction or management of such entity, whether by contract or -otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. - -"You" (or "Your") shall mean an individual or Legal Entity -exercising permissions granted by this License. - -"Source" form shall mean the preferred form for making modifications, -including but not limited to software source code, documentation -source, and configuration files. - -"Object" form shall mean any form resulting from mechanical -transformation or translation of a Source form, including but -not limited to compiled object code, generated documentation, -and conversions to other media types. - -"Work" shall mean the work of authorship, whether in Source or -Object form, made available under the License, as indicated by a -copyright notice that is included in or attached to the work -(an example is provided in the Appendix below). - -"Derivative Works" shall mean any work, whether in Source or Object -form, that is based on (or derived from) the Work and for which the -editorial revisions, annotations, elaborations, or other modifications -represent, as a whole, an original work of authorship. For the purposes -of this License, Derivative Works shall not include works that remain -separable from, or merely link (or bind by name) to the interfaces of, -the Work and Derivative Works thereof. - -"Contribution" shall mean any work of authorship, including -the original version of the Work and any modifications or additions -to that Work or Derivative Works thereof, that is intentionally -submitted to Licensor for inclusion in the Work by the copyright owner -or by an individual or Legal Entity authorized to submit on behalf of -the copyright owner. 
For the purposes of this definition, "submitted" -means any form of electronic, verbal, or written communication sent -to the Licensor or its representatives, including but not limited to -communication on electronic mailing lists, source code control systems, -and issue tracking systems that are managed by, or on behalf of, the -Licensor for the purpose of discussing and improving the Work, but -excluding communication that is conspicuously marked or otherwise -designated in writing by the copyright owner as "Not a Contribution." - -"Contributor" shall mean Licensor and any individual or Legal Entity -on behalf of whom a Contribution has been received by Licensor and -subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of -this License, each Contributor hereby grants to You a perpetual, -worldwide, non-exclusive, no-charge, royalty-free, irrevocable -copyright license to reproduce, prepare Derivative Works of, -publicly display, publicly perform, sublicense, and distribute the -Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. Subject to the terms and conditions of -this License, each Contributor hereby grants to You a perpetual, -worldwide, non-exclusive, no-charge, royalty-free, irrevocable -(except as stated in this section) patent license to make, have made, -use, offer to sell, sell, import, and otherwise transfer the Work, -where such license applies only to those patent claims licensable -by such Contributor that are necessarily infringed by their -Contribution(s) alone or by combination of their Contribution(s) -with the Work to which such Contribution(s) was submitted. If You -institute patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that the Work -or a Contribution incorporated within the Work constitutes direct -or contributory patent infringement, then any patent licenses -granted to You under this License for that Work shall terminate -as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the -Work or Derivative Works thereof in any medium, with or without -modifications, and in Source or Object form, provided that You -meet the following conditions: - -(a) You must give any other recipients of the Work or -Derivative Works a copy of this License; and - -(b) You must cause any modified files to carry prominent notices -stating that You changed the files; and - -(c) You must retain, in the Source form of any Derivative Works -that You distribute, all copyright, patent, trademark, and -attribution notices from the Source form of the Work, -excluding those notices that do not pertain to any part of -the Derivative Works; and - -(d) If the Work includes a "NOTICE" text file as part of its -distribution, then any Derivative Works that You distribute must -include a readable copy of the attribution notices contained -within such NOTICE file, excluding those notices that do not -pertain to any part of the Derivative Works, in at least one -of the following places: within a NOTICE text file distributed -as part of the Derivative Works; within the Source form or -documentation, if provided along with the Derivative Works; or, -within a display generated by the Derivative Works, if and -wherever such third-party notices normally appear. The contents -of the NOTICE file are for informational purposes only and -do not modify the License. 
You may add Your own attribution -notices within Derivative Works that You distribute, alongside -or as an addendum to the NOTICE text from the Work, provided -that such additional attribution notices cannot be construed -as modifying the License. - -You may add Your own copyright statement to Your modifications and -may provide additional or different license terms and conditions -for use, reproduction, or distribution of Your modifications, or -for any such Derivative Works as a whole, provided Your use, -reproduction, and distribution of the Work otherwise complies with -the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, -any Contribution intentionally submitted for inclusion in the Work -by You to the Licensor shall be under the terms and conditions of -this License, without any additional terms or conditions. -Notwithstanding the above, nothing herein shall supersede or modify -the terms of any separate license agreement you may have executed -with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade -names, trademarks, service marks, or product names of the Licensor, -except as required for reasonable and customary use in describing the -origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or -agreed to in writing, Licensor provides the Work (and each -Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -implied, including, without limitation, any warranties or conditions -of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -PARTICULAR PURPOSE. You are solely responsible for determining the -appropriateness of using or redistributing the Work and assume any -risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, -whether in tort (including negligence), contract, or otherwise, -unless required by applicable law (such as deliberate and grossly -negligent acts) or agreed to in writing, shall any Contributor be -liable to You for damages, including any direct, indirect, special, -incidental, or consequential damages of any character arising as a -result of this License or out of the use or inability to use the -Work (including but not limited to damages for loss of goodwill, -work stoppage, computer failure or malfunction, or any and all -other commercial damages or losses), even if such Contributor -has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing -the Work or Derivative Works thereof, You may choose to offer, -and charge a fee for, acceptance of support, warranty, indemnity, -or other liability obligations and/or rights consistent with this -License. However, in accepting such obligations, You may act only -on Your own behalf and on Your sole responsibility, not on behalf -of any other Contributor, and only if You agree to indemnify, -defend, and hold each Contributor harmless for any liability -incurred by, or claims asserted against, such Contributor by reason -of your accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. 
- -To apply the Apache License to your work, attach the following -boilerplate notice, with the fields enclosed by brackets "[]" -replaced with your own identifying information. (Don't include -the brackets!) The text should be enclosed in the appropriate -comment syntax for the file format. We also recommend that a -file or class name and description of purpose be included on the -same "printed page" as the copyright notice for easier -identification within third-party archives. - -Copyright [yyyy] [name of copyright owner] - - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/LICENSE_Mask2Former.md b/torchsig/models/spectrogram_models/mask2former/LICENSE_Mask2Former.md deleted file mode 100644 index 40b7f64..0000000 --- a/torchsig/models/spectrogram_models/mask2former/LICENSE_Mask2Former.md +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2022 Meta, Inc. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/README.md b/torchsig/models/spectrogram_models/mask2former/README.md deleted file mode 100644 index 2798cf6..0000000 --- a/torchsig/models/spectrogram_models/mask2former/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Mask2Former - -The Mask2Former code contained here is a cloned, modified, and supplemented version from the original source provided by the authors at the official [Mask2Former GitHub](https://github.com/facebookresearch/Mask2Former) site. Additionally, since Mask2Former's source code was built using the [Detectron2](https://github.com/facebookresearch/detectron2) framework, several features of Detectron2 have been pulled into these modules. - -The original Mask2Former code is licensed under an MIT license. The original Detectron2 code is licensed under an Apache 2.0 license. These licenses are contained within this directory. 
diff --git a/torchsig/models/spectrogram_models/mask2former/__init__.py b/torchsig/models/spectrogram_models/mask2former/__init__.py deleted file mode 100644 index 2caec72..0000000 --- a/torchsig/models/spectrogram_models/mask2former/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .mask2former import * \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/backbone.py b/torchsig/models/spectrogram_models/mask2former/backbone.py deleted file mode 100644 index c93ffee..0000000 --- a/torchsig/models/spectrogram_models/mask2former/backbone.py +++ /dev/null @@ -1,38 +0,0 @@ -import timm -import numpy as np -import torch -import torch.nn as nn - - -class ResNet50Backbone(nn.Module): - def __init__(self): - super().__init__() - self.resnet50 = timm.create_model('resnet50', in_chans=2).float() - - def forward(self, x): - features = {} - layers = list(self.resnet50.children()) - for i, layer in enumerate(layers): - x = layer(x) - if isinstance(layer, nn.Sequential): - features[str(len(features))] = x - return features - - -class EffNetBackbone(nn.Module): - def __init__(self, network='efficientnet_b0'): - super().__init__() - self.network = timm.create_model(network, in_chans=2).float() - - def forward(self, x): - features = {} - layers = list(self.network.children()) - for i, layer in enumerate(layers): - if isinstance(layer, nn.Sequential): - for ii, blocks in enumerate(layer): - x = blocks(x) - if isinstance(blocks, nn.Sequential): - features[str(len(features))] = x - else: - x = layer(x) - return features \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/criterion.py b/torchsig/models/spectrogram_models/mask2former/criterion.py deleted file mode 100644 index 2d57228..0000000 --- a/torchsig/models/spectrogram_models/mask2former/criterion.py +++ /dev/null @@ -1,583 +0,0 @@ -""" -Criterion and matching modules from Detectron2, Mask2Former, and DETR codebases -""" -import numpy as np -import torch -from torch import nn, Tensor -import torch.nn.functional as F -import torch.distributed as dist -from torch.cuda.amp import autocast -import torchvision -from scipy.optimize import linear_sum_assignment -from typing import List, Optional - -from .utils import _max_by_axis - - -def get_world_size() -> int: - if not dist.is_available(): - return 1 - if not dist.is_initialized(): - return 1 - return dist.get_world_size() - - -def get_uncertain_point_coords_with_randomness( - coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio -): - """ - Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The unceratinties - are calculated for each point using 'uncertainty_func' function that takes point's logit - prediction as input. - See PointRend paper for details. - Args: - coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for - class-specific or class-agnostic prediction. - uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that - contains logit predictions for P points and returns their uncertainties as a Tensor of - shape (N, 1, P). - num_points (int): The number of points P to sample. - oversample_ratio (int): Oversampling parameter. - importance_sample_ratio (float): Ratio of points that are sampled via importnace sampling. - Returns: - point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P - sampled points. 
- """ - assert oversample_ratio >= 1 - assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0 - num_boxes = coarse_logits.shape[0] - num_sampled = int(num_points * oversample_ratio) - point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device) - point_logits = point_sample(coarse_logits, point_coords, align_corners=False) - # It is crucial to calculate uncertainty based on the sampled prediction value for the points. - # Calculating uncertainties of the coarse predictions first and sampling them for points leads - # to incorrect results. - # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between - # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value. - # However, if we calculate uncertainties for the coarse predictions first, - # both will have -1 uncertainty, and the sampled point will get -1 uncertainty. - point_uncertainties = uncertainty_func(point_logits) - num_uncertain_points = int(importance_sample_ratio * num_points) - num_random_points = num_points - num_uncertain_points - idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] - shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device) - idx += shift[:, None] - point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( - num_boxes, num_uncertain_points, 2 - ) - if num_random_points > 0: - point_coords = torch.cat( - [ - point_coords, - torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device), - ], - dim=1, - ) - return point_coords - - -def point_sample(input, point_coords, **kwargs): - """ - A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors. - Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside - [0, 1] x [0, 1] square. - Args: - input (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid. - point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains - [0, 1] x [0, 1] normalized point coordinates. - Returns: - output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains - features for points in `point_coords`. The features are obtained via bilinear - interplation from `input` the same way as :function:`torch.nn.functional.grid_sample`. 
- """ - add_dim = False - if point_coords.dim() == 3: - add_dim = True - point_coords = point_coords.unsqueeze(2) - output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs) - if add_dim: - output = output.squeeze(3) - return output - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - # TODO make this more general - if tensor_list[0].ndim == 3: - if torchvision._is_tracing(): - # nested_tensor_from_tensor_list() does not export well to ONNX - # call _onnx_nested_tensor_from_tensor_list() instead - return _onnx_nested_tensor_from_tensor_list(tensor_list) - - # TODO make it support different-sized images - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) - batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("not supported") - return NestedTensor(tensor, mask) - - -class NestedTensor(object): - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - # type: (Device) -> NestedTensor # noqa - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - assert mask is not None - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -def dice_loss( - inputs: torch.Tensor, - targets: torch.Tensor, - num_masks: float, - ): - """ - Compute the DICE loss, similar to generalized IOU for masks - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs - (0 for the negative class and 1 for the positive class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(-1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_masks - - -dice_loss_jit = torch.jit.script( - dice_loss -) # type: torch.jit.ScriptModule - - -def sigmoid_ce_loss( - inputs: torch.Tensor, - targets: torch.Tensor, - num_masks: float, - ): - """ - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs - (0 for the negative class and 1 for the positive class). - Returns: - Loss tensor - """ - loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - - return loss.mean(1).sum() / num_masks - - -sigmoid_ce_loss_jit = torch.jit.script( - sigmoid_ce_loss -) # type: torch.jit.ScriptModule - - -def calculate_uncertainty(logits): - """ - We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the - foreground class in `classes`. 
- Args: - logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or - class-agnostic, where R is the total number of predicted masks in all images and C is - the number of foreground classes. The values are logits. - Returns: - scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with - the most uncertain locations having the highest uncertainty score. - """ - assert logits.shape[1] == 1 - gt_class_logits = logits.clone() - return -(torch.abs(gt_class_logits)) - - -class SetCriterion(nn.Module): - """This class computes the loss for DETR. - The process happens in two steps: - 1) we compute hungarian assignment between ground truth boxes and the outputs of the model - 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) - """ - - def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, - num_points, oversample_ratio, importance_sample_ratio): - """Create the criterion. - Parameters: - num_classes: number of object categories, omitting the special no-object category - matcher: module able to compute a matching between targets and proposals - weight_dict: dict containing as key the names of the losses and as values their relative weight. - eos_coef: relative classification weight applied to the no-object category - losses: list of all the losses to be applied. See get_loss for list of available losses. - """ - super().__init__() - self.num_classes = num_classes - self.matcher = matcher - self.weight_dict = weight_dict - self.eos_coef = eos_coef - self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer("empty_weight", empty_weight) - - # pointwise mask loss parameters - self.num_points = num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - - def loss_labels(self, outputs, targets, indices, num_masks): - """Classification loss (NLL) - targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] - """ - assert "pred_logits" in outputs - src_logits = outputs["pred_logits"].float() - - idx = self._get_src_permutation_idx(indices) - target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device - ) - target_classes[idx] = target_classes_o - - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) - losses = {"loss_ce": loss_ce} - return losses - - def loss_masks(self, outputs, targets, indices, num_masks): - """Compute the losses related to the masks: the focal loss and the dice loss. 
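The classification term used by SetCriterion.loss_labels above is a plain cross-entropy in which every unmatched query is assigned the trailing "no-object" class, down-weighted by eos_coef. A short sketch of that weighting; the class count, query count, and eos_coef below are illustrative:

```python
import torch
import torch.nn.functional as F

num_classes, eos_coef = 3, 0.1
empty_weight = torch.ones(num_classes + 1)   # real classes weighted 1.0 ...
empty_weight[-1] = eos_coef                  # ... the extra no-object class down-weighted

# 5 queries, logits over (num_classes + 1); only query 0 is matched to a ground truth.
logits = torch.randn(1, 5, num_classes + 1)
target = torch.full((1, 5), num_classes)     # default label: "no-object"
target[0, 0] = 2                             # the matched query keeps its real class

loss_ce = F.cross_entropy(logits.transpose(1, 2), target, empty_weight)
print(loss_ce)
```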
- targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] - """ - assert "pred_masks" in outputs - - src_idx = self._get_src_permutation_idx(indices) - tgt_idx = self._get_tgt_permutation_idx(indices) - src_masks = outputs["pred_masks"] - src_masks = src_masks[src_idx] - masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(src_masks) - target_masks = target_masks[tgt_idx] - - # No need to upsample predictions as we are using normalized coordinates :) - # N x 1 x H x W - src_masks = src_masks[:, None] - target_masks = target_masks[:, None] - - with torch.no_grad(): - # sample point_coords - point_coords = get_uncertain_point_coords_with_randomness( - src_masks, - lambda logits: calculate_uncertainty(logits), - self.num_points, - self.oversample_ratio, - self.importance_sample_ratio, - ) - # get gt labels - point_labels = point_sample( - target_masks, - point_coords, - align_corners=False, - ).squeeze(1) - - point_logits = point_sample( - src_masks, - point_coords, - align_corners=False, - ).squeeze(1) - - losses = { - "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), - "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), - } - - del src_masks - del target_masks - return losses - - def _get_src_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = torch.cat([src for (src, _) in indices]) - return batch_idx, src_idx - - def _get_tgt_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) - tgt_idx = torch.cat([tgt for (_, tgt) in indices]) - return batch_idx, tgt_idx - - def get_loss(self, loss, outputs, targets, indices, num_masks): - loss_map = { - 'labels': self.loss_labels, - 'masks': self.loss_masks, - } - assert loss in loss_map, f"do you really want to compute {loss} loss?" - return loss_map[loss](outputs, targets, indices, num_masks) - - def forward(self, outputs, targets): - """This performs the loss computation. - Parameters: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc - """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes accross all nodes, for normalization purposes - num_masks = sum(len(t["labels"]) for t in targets) - num_masks = torch.as_tensor( - [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device - ) - if is_dist_avail_and_initialized(): - torch.distributed.all_reduce(num_masks) - num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_masks)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
- if "aux_outputs" in outputs: - for i, aux_outputs in enumerate(outputs["aux_outputs"]): - indices = self.matcher(aux_outputs, targets) - for loss in self.losses: - l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - def __repr__(self): - head = "Criterion " + self.__class__.__name__ - body = [ - "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), - "losses: {}".format(self.losses), - "weight_dict: {}".format(self.weight_dict), - "num_classes: {}".format(self.num_classes), - "eos_coef: {}".format(self.eos_coef), - "num_points: {}".format(self.num_points), - "oversample_ratio: {}".format(self.oversample_ratio), - "importance_sample_ratio: {}".format(self.importance_sample_ratio), - ] - _repr_indent = 4 - lines = [head] + [" " * _repr_indent + line for line in body] - return "\n".join(lines) - - -def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): - """ - Compute the DICE loss, similar to generalized IOU for masks - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs - (0 for the negative class and 1 for the positive class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) - denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] - loss = 1 - (numerator + 1) / (denominator + 1) - return loss - - -batch_dice_loss_jit = torch.jit.script( - batch_dice_loss -) # type: torch.jit.ScriptModule - - -def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): - """ - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs - (0 for the negative class and 1 for the positive class). - Returns: - Loss tensor - """ - hw = inputs.shape[1] - - pos = F.binary_cross_entropy_with_logits( - inputs, torch.ones_like(inputs), reduction="none" - ) - neg = F.binary_cross_entropy_with_logits( - inputs, torch.zeros_like(inputs), reduction="none" - ) - - loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( - "nc,mc->nm", neg, (1 - targets) - ) - - return loss / hw - - -batch_sigmoid_ce_loss_jit = torch.jit.script( - batch_sigmoid_ce_loss -) # type: torch.jit.ScriptModule - - -class HungarianMatcher(nn.Module): - """This class computes an assignment between the targets and the predictions of the network - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, - there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, - while the others are un-matched (and thus treated as non-objects). 
- """ - - def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): - """Creates the matcher - - Params: - cost_class: This is the relative weight of the classification error in the matching cost - cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost - cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost - """ - super().__init__() - self.cost_class = cost_class - self.cost_mask = cost_mask - self.cost_dice = cost_dice - - assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" - - self.num_points = num_points - - @torch.no_grad() - def memory_efficient_forward(self, outputs, targets): - """More memory-friendly matching""" - bs, num_queries = outputs["pred_logits"].shape[:2] - - indices = [] - - # Iterate through batch size - for b in range(bs): - - out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] - tgt_ids = targets[b]["labels"] - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - cost_class = -out_prob[:, tgt_ids] - - out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] - # gt masks are already padded when preparing target - tgt_mask = targets[b]["masks"].to(out_mask) - - out_mask = out_mask[:, None] - tgt_mask = tgt_mask[:, None] - # all masks share the same set of points for efficient matching! - point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) - # get gt labels - tgt_mask = point_sample( - tgt_mask, - point_coords.repeat(tgt_mask.shape[0], 1, 1), - align_corners=False, - ).squeeze(1) - - out_mask = point_sample( - out_mask, - point_coords.repeat(out_mask.shape[0], 1, 1), - align_corners=False, - ).squeeze(1) - - with autocast(enabled=False): - out_mask = out_mask.float() - tgt_mask = tgt_mask.float() - # Compute the focal loss between masks - cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) - - # Compute the dice loss betwen masks - with torch.jit.optimized_execution(False): - cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) - - # Final cost matrix - C = ( - self.cost_mask * cost_mask - + self.cost_class * cost_class - + self.cost_dice * cost_dice - ) - C = C.reshape(num_queries, -1).cpu() - - # -inf values cause error in linear_sum_assignment so replace with large neg - if -np.inf in C: - C = C[np.where(C==-np.inf)] = -1e9 - - indices.append(linear_sum_assignment(C)) - - return [ - (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) - for i, j in indices - ] - - @torch.no_grad() - def forward(self, outputs, targets): - """Performs the matching - - Params: - outputs: This is a dict that contains at least these entries: - "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks - - targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: - "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth - objects in the target) containing the class labels - "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks - - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the 
indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - return self.memory_efficient_forward(outputs, targets) - - def __repr__(self, _repr_indent=4): - head = "Matcher " + self.__class__.__name__ - body = [ - "cost_class: {}".format(self.cost_class), - "cost_mask: {}".format(self.cost_mask), - "cost_dice: {}".format(self.cost_dice), - ] - lines = [head] + [" " * _repr_indent + line for line in body] - return "\n".join(lines) diff --git a/torchsig/models/spectrogram_models/mask2former/mask2former.py b/torchsig/models/spectrogram_models/mask2former/mask2former.py deleted file mode 100644 index 2cba851..0000000 --- a/torchsig/models/spectrogram_models/mask2former/mask2former.py +++ /dev/null @@ -1,258 +0,0 @@ -import timm -import gdown -import torch -import os.path -import numpy as np -from torch import nn - -from .utils import non_max_suppression_df, format_preds, format_targets -from .criterion import SetCriterion, HungarianMatcher - - -__all__ = [ - "mask2former_b0", "mask2former_b2", "mask2former_b4", - "mask2former_b0_mod_family", "mask2former_b2_mod_family", "mask2former_b4_mod_family", -] - -model_urls = { - "mask2former_b0": "1sioOi9k1O3tzxM1Hu5CpME1u9Q3wt_ht", - "mask2former_b2": "1ZJOSu5jLUS-ZgUmytXdMcyuwHaw5C10b", - "mask2former_b4": "1xBdw6oGLn7M3JUR7D7p1mbwelcWUsAvj", - "mask2former_b0_mod_family": "1eRijUw6zuMvPIHNB4-9NwN3rY_1fFA7i", - "mask2former_b2_mod_family": "1pKAGMALwc3XBg1l14cYDHNFw2ObtHMnx", - "mask2former_b4_mod_family": "1-_86eGkTDaq9uykgTEZOo1Gky5ITXLJI", -} - - -def mask2former_b0( - pretrained: bool = False, - path: str = "mask2former_b0.pt", - num_classes: int = 1, -): - """Constructs a Mask2Former architecture with an EfficientNet-B0 backbone. - Mask2Former from `"Masked-attention Mask Transformer for Universal Image Segmentation" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - - """ - from .modules import Mask2FormerModel, create_mask2former - - # Create Mask2Former-B0 - mdl = create_mask2former( - backbone='efficientnet_b0', - pixel_decoder='multi_scale_deformable_attention', - predictor='multi_scale_masked_transformer_decoder', - num_classes=1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['mask2former_b0'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - raise NotImplemented('Mask2Former implementation does not support finetuning to different class sizes yet.') - return mdl - - -def mask2former_b2( - pretrained: bool = False, - path: str = "mask2former_b2.pt", - num_classes: int = 1, -): - """Constructs a Mask2Former architecture with an EfficientNet-B2 backbone. - Mask2Former from `"Masked-attention Mask Transformer for Universal Image Segmentation" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
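Each of these constructors follows the same pattern: build the architecture, optionally fetch a checkpoint from Google Drive with gdown, then load it with load_state_dict(strict=False). A usage sketch under the pre-removal import path; the input size and checkpoint filename are illustrative, and the output keys follow the pred_logits / pred_masks convention consumed by the criterion above:

```python
# Sketch only: this module tree is removed by this patch, so the import path is historical.
import torch
from torchsig.models.spectrogram_models.mask2former import mask2former_b0

# Downloads "mask2former_b0.pt" if it is not already on disk, then loads the weights.
model = mask2former_b0(pretrained=True, path="mask2former_b0.pt")
model.eval()

# Two-channel (I/Q) spectrogram input, batch of one; spatial size is illustrative.
x = torch.randn(1, 2, 512, 512)
with torch.no_grad():
    preds = model(x)  # dict containing "pred_logits" and "pred_masks"
```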
- - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - - """ - from .modules import Mask2FormerModel, create_mask2former - - # Create Mask2Former-B2 - mdl = create_mask2former( - backbone='efficientnet_b2', - pixel_decoder='multi_scale_deformable_attention', - predictor='multi_scale_masked_transformer_decoder', - num_classes=1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['mask2former_b2'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - raise NotImplemented('Mask2Former implementation does not support finetuning to different class sizes yet.') - return mdl - - -def mask2former_b4( - pretrained: bool = False, - path: str = "mask2former_b4.pt", - num_classes: int = 1, -): - """Constructs a Mask2Former architecture with an EfficientNet-B4 backbone. - Mask2Former from `"Masked-attention Mask Transformer for Universal Image Segmentation" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - - """ - from .modules import Mask2FormerModel, create_mask2former - - # Create Mask2Former-B4 - mdl = create_mask2former( - backbone='efficientnet_b4', - pixel_decoder='multi_scale_deformable_attention', - predictor='multi_scale_masked_transformer_decoder', - num_classes=1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['mask2former_b4'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - raise NotImplemented('Mask2Former implementation does not support finetuning to different class sizes yet.') - return mdl - - -def mask2former_b0_mod_family( - pretrained: bool = False, - path: str = "mask2former_b0_mod_family.pt", - num_classes: int = 6, -): - """Constructs a Mask2Former architecture with an EfficientNet-B0 backbone. - Mask2Former from `"Masked-attention Mask Transformer for Universal Image Segmentation" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
- - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 6, final layer will not be loaded from checkpoint - - """ - from .modules import Mask2FormerModel, create_mask2former - - # Create Mask2Former-B0 - mdl = create_mask2former( - backbone='efficientnet_b0', - pixel_decoder='multi_scale_deformable_attention', - predictor='multi_scale_masked_transformer_decoder', - num_classes=6, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['mask2former_b0_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - raise NotImplemented('Mask2Former implementation does not support finetuning to different class sizes yet.') - return mdl - - -def mask2former_b2_mod_family( - pretrained: bool = False, - path: str = "mask2former_b2_mod_family.pt", - num_classes: int = 6, -): - """Constructs a Mask2Former architecture with an EfficientNet-B2 backbone. - Mask2Former from `"Masked-attention Mask Transformer for Universal Image Segmentation" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - - """ - from .modules import Mask2FormerModel, create_mask2former - - # Create Mask2Former-B2 - mdl = create_mask2former( - backbone='efficientnet_b2', - pixel_decoder='multi_scale_deformable_attention', - predictor='multi_scale_masked_transformer_decoder', - num_classes=6, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['mask2former_b2_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - raise NotImplemented('Mask2Former implementation does not support finetuning to different class sizes yet.') - return mdl - - -def mask2former_b4_mod_family( - pretrained: bool = False, - path: str = "mask2former_b4_mod_family.pt", - num_classes: int = 6, -): - """Constructs a Mask2Former architecture with an EfficientNet-B4 backbone. - Mask2Former from `"Masked-attention Mask Transformer for Universal Image Segmentation" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
- - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 6, final layer will not be loaded from checkpoint - - """ - from .modules import Mask2FormerModel, create_mask2former - - # Create Mask2Former-B4 - mdl = create_mask2former( - backbone='efficientnet_b4', - pixel_decoder='multi_scale_deformable_attention', - predictor='multi_scale_masked_transformer_decoder', - num_classes=6, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['mask2former_b4_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - raise NotImplemented('Mask2Former implementation does not support finetuning to different class sizes yet.') - return mdl diff --git a/torchsig/models/spectrogram_models/mask2former/modules.py b/torchsig/models/spectrogram_models/mask2former/modules.py deleted file mode 100644 index 70d1436..0000000 --- a/torchsig/models/spectrogram_models/mask2former/modules.py +++ /dev/null @@ -1,146 +0,0 @@ -import torch -import numpy as np - -from .backbone import EffNetBackbone, ResNet50Backbone -from .pixel_decoder import MSDeformAttnPixelDecoder -from .predictor import MultiScaleMaskedTransformerDecoder - - -class Mask2FormerModel(torch.nn.Module): - def __init__( - self, - backbone: torch.nn.Module, - pixel_decoder: torch.nn.Module, - predictor: torch.nn.Module, - num_classes: int = 1, - ): - super().__init__() - self.backbone = backbone - self.pixel_decoder = pixel_decoder - self.predictor = predictor - self.num_classes = num_classes - - def forward(self, x): - # Propagate inputs through model layers - features = self.backbone(x) - mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) - predictions = self.predictor(multi_scale_features, mask_features, mask=None) - return predictions - - -def create_backbone( - backbone: str = 'efficientnet_b0', -) -> torch.nn.Module: - if 'eff' in backbone: - if 'b0' in backbone or 'b2' in backbone or 'b4' in backbone: - network = EffNetBackbone(network=backbone) - else: - raise NotImplemented("Only B0, B2, and B4 EffNets are supported at this time") - elif backbone == 'resnet50': - network = ResNet50Backbone() - else: - raise NotImplemented("Only EfficientNet and ResNet-50 backbones supported at this time.") - return network - - -def create_pixel_decoder( - pixel_decoder: str = 'multi_scale_deformable_attention', - backbone: str = 'efficientnet_b0', - transformer_dropout: float = 0.0, - transformer_nheads: int = 8, - transformer_dim_feedforward: int = 2048, - transformer_enc_layers: int = 0, - conv_dim: int = 256, - mask_dim: int = 256, - norm: str = 'GN', - common_stride: int = 4, -) -> torch.nn.Module: - if pixel_decoder == 'multi_scale_deformable_attention': - network = MSDeformAttnPixelDecoder( - backbone=backbone, - transformer_dropout=transformer_dropout, - transformer_nheads=transformer_nheads, - transformer_dim_feedforward=transformer_dim_feedforward, - transformer_enc_layers=transformer_enc_layers, - conv_dim=conv_dim, - mask_dim=mask_dim, - norm=norm, - common_stride=common_stride, - ) - else: - raise NotImplemented("Only multi_scale_deformable_attention supported as a pixel decoder at this time.") - return network - - -def create_predictor( - predictor: str = 
'multi_scale_masked_transformer_decoder', - in_channels: int = 256, - mask_classification: bool = True, - num_classes: int = 1, - hidden_dim: int = 256, - num_queries: int = 100, - nheads: int = 8, - dim_feedforward: int = 2048, - dec_layers: int = 10, - pre_norm: bool = False, - mask_dim: int = 256, - enforce_input_project: bool = False, -) -> torch.nn.Module: - if predictor == 'multi_scale_masked_transformer_decoder': - network = MultiScaleMaskedTransformerDecoder( - in_channels=in_channels, - mask_classification=mask_classification, - num_classes=num_classes, - hidden_dim=hidden_dim, - num_queries=num_queries, - nheads=nheads, - dim_feedforward=dim_feedforward, - dec_layers=dec_layers, - pre_norm=pre_norm, - mask_dim=mask_dim, - enforce_input_project=enforce_input_project, - ) - else: - raise NotImplemented("Only multi_scale_masked_transformer_decoder supported as predictor at this time.") - return network - - -def create_mask2former( - backbone: str = 'efficientnet_b0', - pixel_decoder: str = 'multi_scale_deformable_attention', - predictor: str = 'multi_scale_masked_transformer_decoder', - num_classes: int = 1, -) -> torch.nn.Module: - """ - Function used to build a Mask2Former network - - Args: - TODO - - Returns: - torch.nn.Module - """ - # Instantiate backbone - backbone_name = str(backbone) - backbone = create_backbone(backbone_name) - - # Instantiate pixel decoder - pixel_decoder = create_pixel_decoder( - pixel_decoder=pixel_decoder, - backbone=backbone_name, - ) - - # Instantiate predictor - predictor = create_predictor( - predictor=predictor, - num_classes=num_classes, - ) - - # Create full Mask2Former model - network = Mask2FormerModel( - backbone=backbone, - pixel_decoder=pixel_decoder, - predictor=predictor, - num_classes=num_classes - ) - return network \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/ops/functions/__init__.py b/torchsig/models/spectrogram_models/mask2former/ops/functions/__init__.py deleted file mode 100644 index 2b06b5a..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/functions/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from .ms_deform_attn_func import MSDeformAttnFunction - diff --git a/torchsig/models/spectrogram_models/mask2former/ops/functions/ms_deform_attn_func.py b/torchsig/models/spectrogram_models/mask2former/ops/functions/ms_deform_attn_func.py deleted file mode 100644 index 94a36ab..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/functions/ms_deform_attn_func.py +++ /dev/null @@ -1,72 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. 
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import torch -import torch.nn.functional as F -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -try: - import MultiScaleDeformableAttention as MSDA -except ModuleNotFoundError as e: - info_string = ( - "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" - "\t`cd mask2former/modeling/pixel_decoder/ops`\n" - "\t`sh make.sh`\n" - ) - raise ModuleNotFoundError(info_string) - - -class MSDeformAttnFunction(Function): - @staticmethod - def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): - ctx.im2col_step = im2col_step - output = MSDA.ms_deform_attn_forward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) - ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = \ - MSDA.ms_deform_attn_backward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): - # for debug and test only, - # need to use cuda version instead - N_, S_, M_, D_ = value.shape - _, Lq_, M_, L_, P_, _ = sampling_locations.shape - value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for lid_, (H_, W_) in enumerate(value_spatial_shapes): - # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) - # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) - # N_*M_, D_, Lq_, P_ - sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, - mode='bilinear', padding_mode='zeros', align_corners=False) - sampling_value_list.append(sampling_value_l_) - # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) - attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) - output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) - return output.transpose(1, 2).contiguous() diff --git a/torchsig/models/spectrogram_models/mask2former/ops/make.sh b/torchsig/models/spectrogram_models/mask2former/ops/make.sh deleted file mode 100755 index 7b38cdb..0000000 --- 
a/torchsig/models/spectrogram_models/mask2former/ops/make.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -python setup.py build install diff --git a/torchsig/models/spectrogram_models/mask2former/ops/modules/__init__.py b/torchsig/models/spectrogram_models/mask2former/ops/modules/__init__.py deleted file mode 100644 index 6fdbf03..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/modules/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from .ms_deform_attn import MSDeformAttn diff --git a/torchsig/models/spectrogram_models/mask2former/ops/modules/ms_deform_attn.py b/torchsig/models/spectrogram_models/mask2former/ops/modules/ms_deform_attn.py deleted file mode 100644 index e7b4c42..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/modules/ms_deform_attn.py +++ /dev/null @@ -1,125 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. 
-# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import warnings -import math - -import torch -from torch import nn -import torch.nn.functional as F -from torch.nn.init import xavier_uniform_, constant_ - -from ..functions import MSDeformAttnFunction -from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch - - -def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) - return (n & (n-1) == 0) and n != 0 - - -class MSDeformAttn(nn.Module): - def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): - """ - Multi-Scale Deformable Attention Module - :param d_model hidden dimension - :param n_levels number of feature levels - :param n_heads number of attention heads - :param n_points number of sampling points per attention head per feature level - """ - super().__init__() - if d_model % n_heads != 0: - raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) - _d_per_head = d_model // n_heads - # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation - if not _is_power_of_2(_d_per_head): - warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " - "which is more efficient in our CUDA implementation.") - - self.im2col_step = 128 - - self.d_model = d_model - self.n_levels = n_levels - self.n_heads = n_heads - self.n_points = n_points - - self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) - self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) - self.value_proj = nn.Linear(d_model, d_model) - self.output_proj = nn.Linear(d_model, d_model) - - self._reset_parameters() - - def _reset_parameters(self): - constant_(self.sampling_offsets.weight.data, 0.) - thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - constant_(self.attention_weights.weight.data, 0.) - constant_(self.attention_weights.bias.data, 0.) - xavier_uniform_(self.value_proj.weight.data) - constant_(self.value_proj.bias.data, 0.) - xavier_uniform_(self.output_proj.weight.data) - constant_(self.output_proj.bias.data, 0.) 
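For orientation, a minimal construction sketch for the module above (illustrative values only; the import path refers to the tree before this patch removes it):

# Illustrative usage sketch, not part of this patch.
from torchsig.models.spectrogram_models.mask2former.ops.modules import MSDeformAttn

attn = MSDeformAttn(d_model=256, n_levels=4, n_heads=8, n_points=4)
# 256 / 8 = 32 is a power of two, so the efficiency warning in __init__ is not raised.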
- - def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): - """ - :param query (N, Length_{query}, C) - :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area - or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes - :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) - :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] - :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements - - :return output (N, Length_{query}, C) - """ - N, Len_q, _ = query.shape - N, Len_in, _ = input_flatten.shape - assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in - - value = self.value_proj(input_flatten) - if input_padding_mask is not None: - value = value.masked_fill(input_padding_mask[..., None], float(0)) - value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) - sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) - attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) - attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) - # N, Len_q, n_heads, n_levels, n_points, 2 - if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 - else: - raise ValueError( - 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) - try: - output = MSDeformAttnFunction.apply( - value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) - except: - # CPU - output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) - # # For FLOPs calculation only - # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) - output = self.output_proj(output) - return output diff --git a/torchsig/models/spectrogram_models/mask2former/ops/setup.py b/torchsig/models/spectrogram_models/mask2former/ops/setup.py deleted file mode 100644 index 3b57ad3..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/setup.py +++ /dev/null @@ -1,78 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. 
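A hedged sketch of the tensor shapes the forward pass above expects; level_start_index is derived from the per-level spatial shapes the same way the deleted test script does it (names and sizes are illustrative):

import torch

N, C, Len_q = 2, 256, 100
spatial_shapes = torch.as_tensor([[32, 32], [16, 16], [8, 8], [4, 4]], dtype=torch.long)
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))
Len_in = int(spatial_shapes.prod(1).sum())        # sum over levels of H_l * W_l

query = torch.rand(N, Len_q, C)
reference_points = torch.rand(N, Len_q, spatial_shapes.size(0), 2)  # in [0, 1]
input_flatten = torch.rand(N, Len_in, C)

# output = attn(query, reference_points, input_flatten,
#               spatial_shapes, level_start_index)   # -> (N, Len_q, C)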
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -import os -import glob - -import torch - -from torch.utils.cpp_extension import CUDA_HOME -from torch.utils.cpp_extension import CppExtension -from torch.utils.cpp_extension import CUDAExtension - -from setuptools import find_packages -from setuptools import setup - -requirements = ["torch", "torchvision"] - -def get_extensions(): - this_dir = os.path.dirname(os.path.abspath(__file__)) - extensions_dir = os.path.join(this_dir, "src") - - main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) - source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) - source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) - - sources = main_file + source_cpu - extension = CppExtension - extra_compile_args = {"cxx": []} - define_macros = [] - - # Force cuda since torch ask for a device, not if cuda is in fact available. - if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: - extension = CUDAExtension - sources += source_cuda - define_macros += [("WITH_CUDA", None)] - extra_compile_args["nvcc"] = [ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ] - else: - if CUDA_HOME is None: - raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') - else: - raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') - - sources = [os.path.join(extensions_dir, s) for s in sources] - include_dirs = [extensions_dir] - ext_modules = [ - extension( - "MultiScaleDeformableAttention", - sources, - include_dirs=include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_args, - ) - ] - return ext_modules - -setup( - name="MultiScaleDeformableAttention", - version="1.0", - author="Weijie Su", - url="https://github.com/fundamentalvision/Deformable-DETR", - description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", - packages=find_packages(exclude=("configs", "tests",)), - ext_modules=get_extensions(), - cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, -) diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.cpp b/torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.cpp deleted file mode 100644 index 48757e2..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. 
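The setup.py above builds the op ahead of time (sh make.sh runs python setup.py build install). As an alternative sketch, an assumption rather than anything in this patch, the same sources could be JIT-compiled with torch.utils.cpp_extension.load; the path below refers to the tree before this patch removes it:

import glob
import os
from torch.utils.cpp_extension import load

# Hypothetical JIT build of the same extension sources.
src = "torchsig/models/spectrogram_models/mask2former/ops/src"
MSDA = load(
    name="MultiScaleDeformableAttention",
    sources=(glob.glob(os.path.join(src, "*.cpp"))
             + glob.glob(os.path.join(src, "cpu", "*.cpp"))
             + glob.glob(os.path.join(src, "cuda", "*.cu"))),
    extra_include_paths=[src],
    extra_cflags=["-DWITH_CUDA"],
    extra_cuda_cflags=["-DWITH_CUDA"],
    verbose=True,
)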
-* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include - -#include -#include - - -at::Tensor -ms_deform_attn_cpu_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) -{ - AT_ERROR("Not implement on cpu"); -} - -std::vector -ms_deform_attn_cpu_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step) -{ - AT_ERROR("Not implement on cpu"); -} - diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.h b/torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.h deleted file mode 100644 index 51bb27e..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/src/cpu/ms_deform_attn_cpu.h +++ /dev/null @@ -1,38 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#pragma once -#include - -at::Tensor -ms_deform_attn_cpu_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step); - -std::vector -ms_deform_attn_cpu_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step); - - diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.cu b/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.cu deleted file mode 100644 index 0c465da..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.cu +++ /dev/null @@ -1,158 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. 
-* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include -#include "cuda/ms_deform_im2col_cuda.cuh" - -#include -#include -#include -#include - - -at::Tensor ms_deform_attn_cuda_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) -{ - AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); - AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); - AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); - AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); - AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); - - AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); - AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); - AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); - AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); - AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); - - const int batch = value.size(0); - const int spatial_size = value.size(1); - const int num_heads = value.size(2); - const int channels = value.size(3); - - const int num_levels = spatial_shapes.size(0); - - const int num_query = sampling_loc.size(1); - const int num_point = sampling_loc.size(4); - - const int im2col_step_ = std::min(batch, im2col_step); - - AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); - - auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); - - const int batch_n = im2col_step_; - auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); - auto per_value_size = spatial_size * num_heads * channels; - auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; - auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; - for (int n = 0; n < batch/im2col_step_; ++n) - { - auto columns = output_n.select(0, n); - AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { - ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), - value.data() + n * im2col_step_ * per_value_size, - spatial_shapes.data(), - level_start_index.data(), - sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, - attn_weight.data() + n * im2col_step_ * per_attn_weight_size, - batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, - columns.data()); - - })); - } - - output = output.view({batch, num_query, num_heads*channels}); - - return output; -} - - -std::vector ms_deform_attn_cuda_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int 
im2col_step) -{ - - AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); - AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); - AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); - AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); - AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); - AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); - - AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); - AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); - AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); - AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); - AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); - AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); - - const int batch = value.size(0); - const int spatial_size = value.size(1); - const int num_heads = value.size(2); - const int channels = value.size(3); - - const int num_levels = spatial_shapes.size(0); - - const int num_query = sampling_loc.size(1); - const int num_point = sampling_loc.size(4); - - const int im2col_step_ = std::min(batch, im2col_step); - - AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); - - auto grad_value = at::zeros_like(value); - auto grad_sampling_loc = at::zeros_like(sampling_loc); - auto grad_attn_weight = at::zeros_like(attn_weight); - - const int batch_n = im2col_step_; - auto per_value_size = spatial_size * num_heads * channels; - auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; - auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; - auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); - - for (int n = 0; n < batch/im2col_step_; ++n) - { - auto grad_output_g = grad_output_n.select(0, n); - AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { - ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), - grad_output_g.data(), - value.data() + n * im2col_step_ * per_value_size, - spatial_shapes.data(), - level_start_index.data(), - sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, - attn_weight.data() + n * im2col_step_ * per_attn_weight_size, - batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, - grad_value.data() + n * im2col_step_ * per_value_size, - grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, - grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); - - })); - } - - return { - grad_value, grad_sampling_loc, grad_attn_weight - }; -} \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.h b/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.h deleted file mode 100644 index 4f0658e..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_attn_cuda.h +++ /dev/null @@ -1,35 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. 
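The im2col_step bookkeeping above splits the batch into fixed-size chunks before launching the kernels; a small worked sketch with illustrative numbers:

batch, im2col_step = 4, 128
im2col_step_ = min(batch, im2col_step)    # 4: never larger than the batch
assert batch % im2col_step_ == 0          # required before the per-chunk loop
num_chunks = batch // im2col_step_        # one kernel launch per chunk
# output is viewed as (num_chunks, im2col_step_, num_query, num_heads, channels),
# and each chunk advances the value / sampling_loc / attn_weight pointers by
# im2col_step_ * per_value_size, im2col_step_ * per_sample_loc_size, and so on.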
-* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
-**************************************************************************************************
-* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
-**************************************************************************************************
-*/
-
-/*!
-* Copyright (c) Facebook, Inc. and its affiliates.
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
-*/
-
-#pragma once
-#include <torch/extension.h>
-
-at::Tensor ms_deform_attn_cuda_forward(
-    const at::Tensor &value,
-    const at::Tensor &spatial_shapes,
-    const at::Tensor &level_start_index,
-    const at::Tensor &sampling_loc,
-    const at::Tensor &attn_weight,
-    const int im2col_step);
-
-std::vector<at::Tensor> ms_deform_attn_cuda_backward(
-    const at::Tensor &value,
-    const at::Tensor &spatial_shapes,
-    const at::Tensor &level_start_index,
-    const at::Tensor &sampling_loc,
-    const at::Tensor &attn_weight,
-    const at::Tensor &grad_output,
-    const int im2col_step);
-
diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_im2col_cuda.cuh b/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_im2col_cuda.cuh
deleted file mode 100644
index c04e0d4..0000000
--- a/torchsig/models/spectrogram_models/mask2former/ops/src/cuda/ms_deform_im2col_cuda.cuh
+++ /dev/null
@@ -1,1332 +0,0 @@
-/*!
-**************************************************************************
-* Deformable DETR
-* Copyright (c) 2020 SenseTime. All Rights Reserved.
-* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
-**************************************************************************
-* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
-* Copyright (c) 2018 Microsoft
-**************************************************************************
-*/
-
-/*!
-* Copyright (c) Facebook, Inc. and its affiliates.
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include -#include -#include - -#include -#include - -#include - -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -const int CUDA_NUM_THREADS = 1024; -inline int GET_BLOCKS(const int N, const int num_threads) -{ - return (N + num_threads - 1) / num_threads; -} - - -template -__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, - const int &height, const int &width, const int &nheads, const int &channels, - const scalar_t &h, const scalar_t &w, const int &m, const int &c) -{ - const int h_low = floor(h); - const int w_low = floor(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; - - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; - - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - { - const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - } - - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - - -template -__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, - const int &height, const int &width, const int &nheads, const int &channels, - const scalar_t &h, const scalar_t &w, const int &m, const int &c, - const scalar_t &top_grad, - const scalar_t &attn_weight, - scalar_t* &grad_value, - scalar_t* grad_sampling_loc, - scalar_t* grad_attn_weight) -{ - const int h_low = floor(h); - const int w_low = floor(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; - - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; - - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; - - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - const scalar_t top_grad_value = top_grad * attn_weight; - scalar_t grad_h_weight = 0, grad_w_weight = 0; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - grad_h_weight -= hw * v1; - grad_w_weight -= hh * v1; - atomicAdd(grad_value+ptr1, w1*top_grad_value); - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - { - 
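For intuition, the bilinear gather used by ms_deform_attn_im2col_bilinear above (weights w1..w4 over the four neighbouring cells, with out-of-range neighbours contributing zero) can be sketched in plain PyTorch; the helper name is illustrative, not from the patch:

import math
import torch

def bilinear_sample(value: torch.Tensor, h: float, w: float) -> torch.Tensor:
    # value: (H, W) feature map; (h, w): fractional sampling location.
    H, W = value.shape
    h_low, w_low = math.floor(h), math.floor(w)
    lh, lw = h - h_low, w - w_low
    hh, hw = 1.0 - lh, 1.0 - lw

    def at(y: int, x: int) -> torch.Tensor:
        # Out-of-range neighbours are treated as zero, matching the kernel.
        if 0 <= y < H and 0 <= x < W:
            return value[y, x]
        return value.new_zeros(())

    # w1..w4 mirror the weights computed in the CUDA helper above.
    return (hh * hw * at(h_low, w_low)
            + hh * lw * at(h_low, w_low + 1)
            + lh * hw * at(h_low + 1, w_low)
            + lh * lw * at(h_low + 1, w_low + 1))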
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - grad_h_weight -= lw * v2; - grad_w_weight += hh * v2; - atomicAdd(grad_value+ptr2, w2*top_grad_value); - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - grad_h_weight += hw * v3; - grad_w_weight -= lh * v3; - atomicAdd(grad_value+ptr3, w3*top_grad_value); - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - grad_h_weight += lw * v4; - grad_w_weight += lh * v4; - atomicAdd(grad_value+ptr4, w4*top_grad_value); - } - - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - *grad_attn_weight = top_grad * val; - *grad_sampling_loc = width * grad_w_weight * top_grad_value; - *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; -} - - -template -__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, - const int &height, const int &width, const int &nheads, const int &channels, - const scalar_t &h, const scalar_t &w, const int &m, const int &c, - const scalar_t &top_grad, - const scalar_t &attn_weight, - scalar_t* &grad_value, - scalar_t* grad_sampling_loc, - scalar_t* grad_attn_weight) -{ - const int h_low = floor(h); - const int w_low = floor(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; - - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; - - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; - - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - const scalar_t top_grad_value = top_grad * attn_weight; - scalar_t grad_h_weight = 0, grad_w_weight = 0; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - grad_h_weight -= hw * v1; - grad_w_weight -= hh * v1; - atomicAdd(grad_value+ptr1, w1*top_grad_value); - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - { - const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - grad_h_weight -= lw * v2; - grad_w_weight += hh * v2; - atomicAdd(grad_value+ptr2, w2*top_grad_value); - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - grad_h_weight += hw * v3; - grad_w_weight -= lh * v3; - atomicAdd(grad_value+ptr3, w3*top_grad_value); - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - grad_h_weight += lw * v4; - grad_w_weight += lh * v4; - atomicAdd(grad_value+ptr4, w4*top_grad_value); - } - - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - atomicAdd(grad_attn_weight, top_grad * val); - atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); - atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); -} - - -template -__global__ void ms_deformable_im2col_gpu_kernel(const int n, - 
const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *data_col) -{ - CUDA_KERNEL_LOOP(index, n) - { - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - scalar_t *data_col_ptr = data_col + index; - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - scalar_t col = 0; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; - } - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - } - } - *data_col_ptr = col; - } -} - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { 
- const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - if (tid == 0) - { - scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; - int sid=2; - for (unsigned int tid = 1; tid < blockSize; ++tid) - { - _grad_w += cache_grad_sampling_loc[sid]; - _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; - sid += 2; - } - - - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = 
l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - - for (unsigned int s=blockSize/2; s>0; s>>=1) - { - if (tid < s) { - const unsigned int xid1 = tid << 1; - const unsigned int xid2 = (tid + s) << 1; - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; - } - __syncthreads(); - } - - if (tid == 0) - { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - extern __shared__ int _s[]; - scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; - scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = 
data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - if (tid == 0) - { - scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; - int sid=2; - for (unsigned int tid = 1; tid < blockDim.x; ++tid) - { - _grad_w += cache_grad_sampling_loc[sid]; - _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; - sid += 2; - } - - - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - extern __shared__ int _s[]; - scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; - scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = 
l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - - for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) - { - if (tid < s) { - const unsigned int xid1 = tid << 1; - const unsigned int xid2 = (tid + s) << 1; - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; - if (tid + (s << 1) < spre) - { - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; - } - } - __syncthreads(); - } - - if (tid == 0) - { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - extern __shared__ int _s[]; - scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; - scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += 
grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - - for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) - { - if (tid < s) { - const unsigned int xid1 = tid << 1; - const unsigned int xid2 = (tid + s) << 1; - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; - if (tid + (s << 1) < spre) - { - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; - } - } - __syncthreads(); - } - - if (tid == 0) - { - atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); - atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); - atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - 
int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear_gm( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - grad_sampling_loc, grad_attn_weight); - } - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -void ms_deformable_im2col_cuda(cudaStream_t stream, - const scalar_t* data_value, - const int64_t* data_spatial_shapes, - const int64_t* data_level_start_index, - const scalar_t* data_sampling_loc, - const scalar_t* data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t* data_col) -{ - const int num_kernels = batch_size * num_query * num_heads * channels; - const int num_actual_kernels = batch_size * num_query * num_heads * channels; - const int num_threads = CUDA_NUM_THREADS; - ms_deformable_im2col_gpu_kernel - <<>>( - num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); - } - -} - -template -void ms_deformable_col2im_cuda(cudaStream_t stream, - const scalar_t* grad_col, - const scalar_t* data_value, - const int64_t * data_spatial_shapes, - const int64_t * data_level_start_index, - const scalar_t * data_sampling_loc, - const scalar_t * data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t* grad_value, - scalar_t* grad_sampling_loc, - scalar_t* grad_attn_weight) -{ - const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; - const int num_kernels = batch_size * num_query * num_heads * channels; - const int num_actual_kernels = batch_size * num_query * num_heads * channels; - if (channels > 1024) - { - if ((channels & 1023) == 0) - { - 
ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - else - { - ms_deformable_col2im_gpu_kernel_gm - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - } - else{ - switch(channels) - { - case 1: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 2: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 4: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 8: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 16: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 32: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 64: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 128: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - 
data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 256: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 512: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 1024: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - default: - if (channels < 64) - { - ms_deformable_col2im_gpu_kernel_shm_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - else - { - ms_deformable_col2im_gpu_kernel_shm_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - } - } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); - } - -} \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/ms_deform_attn.h b/torchsig/models/spectrogram_models/mask2former/ops/src/ms_deform_attn.h deleted file mode 100644 index 2f80a1b..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/src/ms_deform_attn.h +++ /dev/null @@ -1,67 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. 
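A condensed sketch of the backward-kernel dispatch implemented by ms_deformable_col2im_cuda above (illustrative Python mirroring the visible branches; the helper name is not from the patch):

def pick_col2im_kernel(channels: int) -> str:
    # Mirrors the branches above: which CUDA kernel variant gets launched.
    if channels > 1024:
        return "shm_reduce_v2_multi_blocks" if channels % 1024 == 0 else "gm"
    if channels in (1, 2, 4, 8, 16, 32):
        return "shm_blocksize_aware_reduce_v1"   # templated block size == channels
    if channels in (64, 128, 256, 512, 1024):
        return "shm_blocksize_aware_reduce_v2"
    return "shm_reduce_v1" if channels < 64 else "shm_reduce_v2"

# For the channel sizes swept by the deleted test script:
# 30 -> shm_reduce_v1, 32 -> shm_blocksize_aware_reduce_v1,
# 64 -> shm_blocksize_aware_reduce_v2, 71 -> shm_reduce_v2,
# 1025 -> gm, 2048 -> shm_reduce_v2_multi_blocks, 3096 -> gm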
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#pragma once - -#include "cpu/ms_deform_attn_cpu.h" - -#ifdef WITH_CUDA -#include "cuda/ms_deform_attn_cuda.h" -#endif - - -at::Tensor -ms_deform_attn_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) -{ - if (value.type().is_cuda()) - { -#ifdef WITH_CUDA - return ms_deform_attn_cuda_forward( - value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); -#else - AT_ERROR("Not compiled with GPU support"); -#endif - } - AT_ERROR("Not implemented on the CPU"); -} - -std::vector -ms_deform_attn_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step) -{ - if (value.type().is_cuda()) - { -#ifdef WITH_CUDA - return ms_deform_attn_cuda_backward( - value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); -#else - AT_ERROR("Not compiled with GPU support"); -#endif - } - AT_ERROR("Not implemented on the CPU"); -} - diff --git a/torchsig/models/spectrogram_models/mask2former/ops/src/vision.cpp b/torchsig/models/spectrogram_models/mask2former/ops/src/vision.cpp deleted file mode 100644 index 4a08821..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/src/vision.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include "ms_deform_attn.h" - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); - m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); -} diff --git a/torchsig/models/spectrogram_models/mask2former/ops/test.py b/torchsig/models/spectrogram_models/mask2former/ops/test.py deleted file mode 100644 index 6e1b545..0000000 --- a/torchsig/models/spectrogram_models/mask2former/ops/test.py +++ /dev/null @@ -1,92 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. 
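[Editor's illustrative note] The binding layer shown just above (ms_deform_attn.h plus the PYBIND11_MODULE in vision.cpp) is what gets compiled into the `MultiScaleDeformableAttention` extension that the test script below imports. As a rough, hypothetical sketch of how such an extension is typically built with PyTorch's `torch.utils.cpp_extension` helpers (the project's actual ops/setup.py and make.sh may differ in sources, flags, and naming):

    # Illustrative sketch only; assumes the src/ layout referenced in the headers above.
    import glob
    from setuptools import setup
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sources = (
        glob.glob("src/*.cpp")
        + glob.glob("src/cpu/*.cpp")
        + glob.glob("src/cuda/*.cu")
    )

    setup(
        name="MultiScaleDeformableAttention",
        ext_modules=[
            CUDAExtension(
                name="MultiScaleDeformableAttention",
                sources=sources,
                # WITH_CUDA gates the GPU branch in ms_deform_attn_forward/backward above.
                define_macros=[("WITH_CUDA", None)],
            )
        ],
        cmdclass={"build_ext": BuildExtension},
    )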
-# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import time -import torch -import torch.nn as nn -from torch.autograd import gradcheck - -from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch - - -N, M, D = 1, 2, 2 -Lq, L, P = 2, 2, 2 -shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() -level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) -S = sum([(H*W).item() for H, W in shapes]) - - -torch.manual_seed(3) - - -@torch.no_grad() -def check_forward_equal_with_pytorch_double(): - value = torch.rand(N, S, M, D).cuda() * 0.01 - sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() - attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) - im2col_step = 2 - output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() - output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() - fwdok = torch.allclose(output_cuda, output_pytorch) - max_abs_err = (output_cuda - output_pytorch).abs().max() - max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() - - print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') - - -@torch.no_grad() -def check_forward_equal_with_pytorch_float(): - value = torch.rand(N, S, M, D).cuda() * 0.01 - sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() - attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) - im2col_step = 2 - output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() - output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() - fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) - max_abs_err = (output_cuda - output_pytorch).abs().max() - max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() - - print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') - - -def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): - - value = torch.rand(N, S, M, channels).cuda() * 0.01 - sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() - attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) - im2col_step = 2 - func = MSDeformAttnFunction.apply - - value.requires_grad = grad_value - sampling_locations.requires_grad = grad_sampling_loc - attention_weights.requires_grad = grad_attn_weight - - gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) - - print(f'* {gradok} check_gradient_numerical(D={channels})') - - -if __name__ == '__main__': - check_forward_equal_with_pytorch_double() - check_forward_equal_with_pytorch_float() - - for channels in [30, 32, 64, 71, 1025, 2048, 3096]: - 
check_gradient_numerical(channels, True, True, True) - - - diff --git a/torchsig/models/spectrogram_models/mask2former/pixel_decoder.py b/torchsig/models/spectrogram_models/mask2former/pixel_decoder.py deleted file mode 100644 index 992ae5e..0000000 --- a/torchsig/models/spectrogram_models/mask2former/pixel_decoder.py +++ /dev/null @@ -1,724 +0,0 @@ -import math -import torch -import warnings -import numpy as np -from torch import nn -import torch.nn.functional as F -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ -from collections import namedtuple -from typing import Dict, Optional, Union, Callable, List - -try: - import MultiScaleDeformableAttention as MSDA -except ModuleNotFoundError as e: - info_string = ( - "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" - "\t`cd spdata/spdata/models/spectrogram_models/mask2former/ops/`\n" - "\t`sh make.sh`\n" - ) - raise ModuleNotFoundError(info_string) - - -class MSDeformAttnFunction(Function): - @staticmethod - def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): - ctx.im2col_step = im2col_step - output = MSDA.ms_deform_attn_forward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) - ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = \ - MSDA.ms_deform_attn_backward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): - # for debug and test only, - # need to use cuda version instead - N_, S_, M_, D_ = value.shape - _, Lq_, M_, L_, P_, _ = sampling_locations.shape - value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for lid_, (H_, W_) in enumerate(value_spatial_shapes): - # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) - # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) - # N_*M_, D_, Lq_, P_ - sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, - mode='bilinear', padding_mode='zeros', align_corners=False) - sampling_value_list.append(sampling_value_l_) - # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) - attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) - output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) - return output.transpose(1, 2).contiguous() - - -class Conv2d(torch.nn.Conv2d): - """ - A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. 
- """ - def __init__(self, *args, **kwargs): - """ - Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: - - Args: - norm (nn.Module, optional): a normalization layer - activation (callable(Tensor) -> Tensor): a callable activation function - - It assumes that norm layer is used before activation. - """ - norm = kwargs.pop("norm", None) - activation = kwargs.pop("activation", None) - super().__init__(*args, **kwargs) - - self.norm = norm - self.activation = activation - - def forward(self, x): - # torchscript does not support SyncBatchNorm yet - # https://github.com/pytorch/pytorch/issues/40507 - # and we skip these codes in torchscript since: - # 1. currently we only support torchscript in evaluation mode - # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or - # later version, `Conv2d` in these PyTorch versions has already supported empty inputs. - if not torch.jit.is_scripting(): - if x.numel() == 0 and self.training: - # https://github.com/pytorch/pytorch/issues/12013 - assert not isinstance( - self.norm, torch.nn.SyncBatchNorm - ), "SyncBatchNorm does not support empty inputs!" - - x = F.conv2d( - x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups - ) - if self.norm is not None: - x = self.norm(x) - if self.activation is not None: - x = self.activation(x) - return x - - -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - -def _get_activation_fn(activation): - """Return an activation function given a string""" - if activation == "relu": - return F.relu - if activation == "gelu": - return F.gelu - if activation == "glu": - return F.glu - raise RuntimeError(f"activation should be relu/gelu, not {activation}.") - - -class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): - """ - A simple structure that contains basic shape specification about a tensor. - It is often used as the auxiliary inputs/outputs of models, - to complement the lack of shape inference ability among pytorch modules. 
- Attributes: - channels: - height: - width: - stride: - """ - - def __new__(cls, channels=None, height=None, width=None, stride=None): - return super().__new__(cls, channels, height, width, stride) - - -def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) - return (n & (n-1) == 0) and n != 0 - - -class MSDeformAttn(nn.Module): - def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): - """ - Multi-Scale Deformable Attention Module - :param d_model hidden dimension - :param n_levels number of feature levels - :param n_heads number of attention heads - :param n_points number of sampling points per attention head per feature level - """ - super().__init__() - if d_model % n_heads != 0: - raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) - _d_per_head = d_model // n_heads - # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation - if not _is_power_of_2(_d_per_head): - warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " - "which is more efficient in our CUDA implementation.") - - self.im2col_step = 128 - - self.d_model = d_model - self.n_levels = n_levels - self.n_heads = n_heads - self.n_points = n_points - - self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) - self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) - self.value_proj = nn.Linear(d_model, d_model) - self.output_proj = nn.Linear(d_model, d_model) - - self._reset_parameters() - - def _reset_parameters(self): - constant_(self.sampling_offsets.weight.data, 0.) - thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - constant_(self.attention_weights.weight.data, 0.) - constant_(self.attention_weights.bias.data, 0.) - xavier_uniform_(self.value_proj.weight.data) - constant_(self.value_proj.bias.data, 0.) - xavier_uniform_(self.output_proj.weight.data) - constant_(self.output_proj.bias.data, 0.) 
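[Editor's illustrative note] As a quick sketch of the tensor bookkeeping MSDeformAttn expects (the shapes are spelled out in the forward docstring that follows), a hypothetical CPU-only call could be set up as below; all sizes are made up for the example and assume MSDeformAttn is in scope:

    # Hypothetical sizes, for illustration only.
    import torch

    N, C, Len_q = 2, 256, 100
    spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long)   # (n_levels, 2)
    level_start_index = torch.cat(
        (spatial_shapes.new_zeros(1), spatial_shapes.prod(1).cumsum(0)[:-1])
    )
    len_in = int(spatial_shapes.prod(1).sum())                                 # 32*32 + 16*16

    query = torch.rand(N, Len_q, C)
    reference_points = torch.rand(N, Len_q, spatial_shapes.size(0), 2)         # normalized to [0, 1]
    input_flatten = torch.rand(N, len_in, C)

    attn = MSDeformAttn(d_model=C, n_levels=2, n_heads=8, n_points=4)          # head dim 32, a power of 2
    output = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
    # output: (N, Len_q, C). Without the compiled CUDA op, the forward pass below
    # falls back to the pure-PyTorch ms_deform_attn_core_pytorch path.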
- - def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): - """ - :param query (N, Length_{query}, C) - :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area - or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes - :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) - :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] - :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements - - :return output (N, Length_{query}, C) - """ - N, Len_q, _ = query.shape - N, Len_in, _ = input_flatten.shape - assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in - - value = self.value_proj(input_flatten) - if input_padding_mask is not None: - value = value.masked_fill(input_padding_mask[..., None], float(0)) - value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) - sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) - attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) - attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) - # N, Len_q, n_heads, n_levels, n_points, 2 - if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 - else: - raise ValueError( - 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) - try: - output = MSDeformAttnFunction.apply( - value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) - except: - # CPU - output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) - # # For FLOPs calculation only - # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) - output = self.output_proj(output) - return output - - -class MSDeformAttnTransformerEncoderOnly(nn.Module): - def __init__( - self, - d_model=256, - nhead=8, - num_encoder_layers=6, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - num_feature_levels=4, - enc_n_points=4, - ): - super().__init__() - - self.d_model = d_model - self.nhead = nhead - - encoder_layer = MSDeformAttnTransformerEncoderLayer( - d_model, - dim_feedforward, - dropout, - activation, - num_feature_levels, - nhead, - enc_n_points, - ) - self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers) - - self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - for m in self.modules(): - if isinstance(m, MSDeformAttn): - 
m._reset_parameters() - normal_(self.level_embed) - - def get_valid_ratio(self, mask): - _, H, W = mask.shape - valid_H = torch.sum(~mask[:, :, 0], 1) - valid_W = torch.sum(~mask[:, 0, :], 1) - valid_ratio_h = valid_H.float() / H - valid_ratio_w = valid_W.float() / W - valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) - return valid_ratio - - def forward(self, srcs, pos_embeds): - masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] - # prepare input for encoder - src_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): - bs, c, h, w = src.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - src = src.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - src_flatten.append(src) - mask_flatten.append(mask) - src_flatten = torch.cat(src_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) - - # encoder - memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten) - - return memory, spatial_shapes, level_start_index - - -class MSDeformAttnTransformerEncoderLayer(nn.Module): - def __init__( - self, - d_model=256, - d_ffn=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_heads=8, - n_points=4, - ): - super().__init__() - - # self attention - self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm(d_model) - - # ffn - self.linear1 = nn.Linear(d_model, d_ffn) - self.activation = _get_activation_fn(activation) - self.dropout2 = nn.Dropout(dropout) - self.linear2 = nn.Linear(d_ffn, d_model) - self.dropout3 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm(d_model) - - @staticmethod - def with_pos_embed(tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, src): - src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) - src = src + self.dropout3(src2) - src = self.norm2(src) - return src - - def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): - # self attention - src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - - # ffn - src = self.forward_ffn(src) - - return src - - -class MSDeformAttnTransformerEncoder(nn.Module): - def __init__(self, encoder_layer, num_layers): - super().__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, device): - reference_points_list = [] - for lvl, (H_, W_) in enumerate(spatial_shapes): - - ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), - torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) - ref_y = ref_y.reshape(-1)[None] 
/ (valid_ratios[:, None, lvl, 1] * H_) - ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) - ref = torch.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = torch.cat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points - - def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): - output = src - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) - for _, layer in enumerate(self.layers): - output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) - - return output - - -class MSDeformAttnPixelDecoder(nn.Module): - def __init__( - self, - # input_shape: Dict[str, ShapeSpec], - backbone: str = "resnet50", - *, - transformer_dropout: float, - transformer_nheads: int, - transformer_dim_feedforward: int, - transformer_enc_layers: int, - conv_dim: int, - mask_dim: int, - norm: Optional[Union[str, Callable]] = None, - # deformable transformer encoder args - # transformer_in_features: List[str], - common_stride: int, - ): - """ - NOTE: this interface is experimental. - Args: - input_shape: shapes (channels and stride) of the input features - transformer_dropout: dropout probability in transformer - transformer_nheads: number of heads in transformer - transformer_dim_feedforward: dimension of feedforward network - transformer_enc_layers: number of transformer encoder layers - conv_dims: number of output channels for the intermediate conv layers. - mask_dim: number of output channels for the final conv layer. - norm (str or callable): normalization for all conv layers - """ - super().__init__() - - if backbone == "resnet50": - input_shape = { - '0': ShapeSpec(channels=256, height=128, width=128, stride=4), - '1': ShapeSpec(channels=512, height=64, width=64, stride=8), - '2': ShapeSpec(channels=1024, height=32, width=32, stride=16), - '3': ShapeSpec(channels=2048, height=16, width=16, stride=32), - } - transformer_in_features = ['2', '3', '4'] - elif backbone == "efficientnet_b0": - input_shape = { - '0': ShapeSpec(channels=16, height=256, width=256, stride=2), - '1': ShapeSpec(channels=24, height=128, width=128, stride=4), - '2': ShapeSpec(channels=40, height=64, width=64, stride=8), - '3': ShapeSpec(channels=80, height=32, width=32, stride=16), - '4': ShapeSpec(channels=112, height=32, width=32, stride=16), - '5': ShapeSpec(channels=192, height=16, width=16, stride=32), - '6': ShapeSpec(channels=320, height=16, width=16, stride=32), - } - transformer_in_features = ['2', '3', '4', '5', '6'] - elif backbone == "efficientnet_b2": - input_shape = { - '0': ShapeSpec(channels=16, height=256, width=256, stride=2), - '1': ShapeSpec(channels=24, height=128, width=128, stride=4), - '2': ShapeSpec(channels=48, height=64, width=64, stride=8), - '3': ShapeSpec(channels=88, height=32, width=32, stride=16), - '4': ShapeSpec(channels=120, height=32, width=32, stride=16), - '5': ShapeSpec(channels=208, height=16, width=16, stride=32), - '6': ShapeSpec(channels=352, height=16, width=16, stride=32), - } - transformer_in_features = ['2', '3', '4', '5', '6'] - elif backbone == "efficientnet_b4": - input_shape = { - '0': ShapeSpec(channels=24, height=256, width=256, stride=2), - '1': ShapeSpec(channels=32, height=128, width=128, stride=4), - '2': ShapeSpec(channels=56, height=64, width=64, stride=8), - '3': ShapeSpec(channels=112, height=32, width=32, stride=16), - '4': 
ShapeSpec(channels=160, height=32, width=32, stride=16), - '5': ShapeSpec(channels=272, height=16, width=16, stride=32), - '6': ShapeSpec(channels=448, height=16, width=16, stride=32), - } - transformer_in_features = ['2', '3', '4', '5', '6'] - else: - raise NotImplemented('Please enter a backbone from list: [resnet50, efficientnet_b0, efficientnet_b2, efficientnet_b4]') - - transformer_input_shape = { - k: v for k, v in input_shape.items() if k in transformer_in_features - } - - # this is the input shape of pixel decoder - input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) - self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" - self.feature_strides = [v.stride for k, v in input_shape] - self.feature_channels = [v.channels for k, v in input_shape] - - # this is the input shape of transformer encoder (could use less features than pixel decoder - transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) - self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" - transformer_in_channels = [v.channels for k, v in transformer_input_shape] - self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers - - self.transformer_num_feature_levels = len(self.transformer_in_features) - if self.transformer_num_feature_levels > 1: - input_proj_list = [] - # from low resolution to high resolution (res5 -> res2) - for in_channels in transformer_in_channels[::-1]: - input_proj_list.append(nn.Sequential( - nn.Conv2d(in_channels, conv_dim, kernel_size=1), - nn.GroupNorm(32, conv_dim), - )) - self.input_proj = nn.ModuleList(input_proj_list) - else: - self.input_proj = nn.ModuleList([ - nn.Sequential( - nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1), - nn.GroupNorm(32, conv_dim), - )]) - - for proj in self.input_proj: - nn.init.xavier_uniform_(proj[0].weight, gain=1) - nn.init.constant_(proj[0].bias, 0) - - self.transformer = MSDeformAttnTransformerEncoderOnly( - d_model=conv_dim, - dropout=transformer_dropout, - nhead=transformer_nheads, - dim_feedforward=transformer_dim_feedforward, - num_encoder_layers=transformer_enc_layers, - num_feature_levels=self.transformer_num_feature_levels, - ) - N_steps = conv_dim // 2 - self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) - - self.mask_dim = mask_dim - # use 1x1 conv instead - self.mask_features = Conv2d( - conv_dim, - mask_dim, - kernel_size=1, - stride=1, - padding=0, - ) - c2_xavier_fill(self.mask_features) - - self.maskformer_num_feature_levels = 3 # always use 3 scales - self.common_stride = common_stride - - # extra fpn levels - stride = min(self.transformer_feature_strides) - self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride)) - - lateral_convs = [] - output_convs = [] - - use_bias = norm == "" - for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): - lateral_norm = get_norm(norm, conv_dim) - output_norm = get_norm(norm, conv_dim) - - lateral_conv = Conv2d( - in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm - ) - output_conv = Conv2d( - conv_dim, - conv_dim, - kernel_size=3, - stride=1, - padding=1, - bias=use_bias, - norm=output_norm, - activation=F.relu, - ) - c2_xavier_fill(lateral_conv) - c2_xavier_fill(output_conv) - self.add_module("adapter_{}".format(idx + 1), lateral_conv) - self.add_module("layer_{}".format(idx + 1), output_conv) - - lateral_convs.append(lateral_conv) - 
output_convs.append(output_conv) - # Place convs into top-down order (from low to high resolution) - # to make the top-down computation in forward clearer. - self.lateral_convs = lateral_convs[::-1] - self.output_convs = output_convs[::-1] - - def forward_features(self, features): - srcs = [] - pos = [] - # Reverse feature maps into top-down order (from low to high resolution) - for idx, f in enumerate(self.transformer_in_features[::-1]): - x = features[f].float() # deformable detr does not support half precision - srcs.append(self.input_proj[idx](x)) - pos.append(self.pe_layer(x)) - - y, spatial_shapes, level_start_index = self.transformer(srcs, pos) - bs = y.shape[0] - - split_size_or_sections = [None] * self.transformer_num_feature_levels - for i in range(self.transformer_num_feature_levels): - if i < self.transformer_num_feature_levels - 1: - split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] - else: - split_size_or_sections[i] = y.shape[1] - level_start_index[i] - y = torch.split(y, split_size_or_sections, dim=1) - - out = [] - multi_scale_features = [] - num_cur_levels = 0 - for i, z in enumerate(y): - out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) - - # append `out` with extra FPN levels - # Reverse feature maps into top-down order (from low to high resolution) - for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): - x = features[f].float() - lateral_conv = self.lateral_convs[idx] - output_conv = self.output_convs[idx] - cur_fpn = lateral_conv(x) - # Following FPN implementation, we use nearest upsampling here - y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) - y = output_conv(y) - out.append(y) - - for o in out: - if num_cur_levels < self.maskformer_num_feature_levels: - multi_scale_features.append(o) - num_cur_levels += 1 - - return self.mask_features(out[-1]), out[0], multi_scale_features - - -class PositionEmbeddingSine(nn.Module): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. 
- """ - - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, x, mask=None): - if mask is None: - mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) - not_mask = ~mask - y_embed = not_mask.cumsum(1, dtype=torch.float32) - x_embed = not_mask.cumsum(2, dtype=torch.float32) - if self.normalize: - eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 - ).flatten(3) - pos_y = torch.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 - ).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) - return pos - - def __repr__(self, _repr_indent=4): - head = "Positional encoding " + self.__class__.__name__ - body = [ - "num_pos_feats: {}".format(self.num_pos_feats), - "temperature: {}".format(self.temperature), - "normalize: {}".format(self.normalize), - "scale: {}".format(self.scale), - ] - # _repr_indent = 4 - lines = [head] + [" " * _repr_indent + line for line in body] - return "\n".join(lines) - - -def c2_xavier_fill(module: nn.Module) -> None: - """ - Initialize `module.weight` using the "XavierFill" implemented in Caffe2. - Also initializes `module.bias` to 0. - - Args: - module (torch.nn.Module): module to initialize. - """ - # Caffe2 implementation of XavierFill in fact - # corresponds to kaiming_uniform_ in PyTorch - # pyre-fixme[6]: For 1st param expected `Tensor` but got `Union[Module, Tensor]`. - nn.init.kaiming_uniform_(module.weight, a=1) - if module.bias is not None: - # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[nn.Module, - # torch.Tensor]`. - nn.init.constant_(module.bias, 0) - - -def get_norm(norm, out_channels): - """ - Args: - norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; - or a callable that takes a channel number and returns - the normalization layer as a nn.Module. 
- - Returns: - nn.Module or None: the normalization layer - """ - if norm is None: - return None - if isinstance(norm, str): - if len(norm) == 0: - return None - norm = { - "BN": nn.BatchNorm2d, - # Fixed in https://github.com/pytorch/pytorch/pull/36382 - # "SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm, - # "FrozenBN": FrozenBatchNorm2d, - "GN": lambda channels: nn.GroupNorm(32, channels), - # for debugging: - "nnSyncBN": nn.SyncBatchNorm, - # "naiveSyncBN": NaiveSyncBatchNorm, - # expose stats_mode N as an option to caller, required for zero-len inputs - # "naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"), - "LN": lambda channels: LayerNorm(channels), - }[norm] - return norm(out_channels) - diff --git a/torchsig/models/spectrogram_models/mask2former/predictor.py b/torchsig/models/spectrogram_models/mask2former/predictor.py deleted file mode 100644 index 7b22a64..0000000 --- a/torchsig/models/spectrogram_models/mask2former/predictor.py +++ /dev/null @@ -1,395 +0,0 @@ -import torch -from torch import nn, Tensor -import torch.nn.functional as F -from typing import Dict, Optional, Union, Callable, List - -from .pixel_decoder import PositionEmbeddingSine, c2_xavier_fill - - -class SelfAttentionLayer(nn.Module): - def __init__(self, d_model, nhead, dropout=0.0, - activation="relu", normalize_before=False): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - - self.norm = nn.LayerNorm(d_model) - self.dropout = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post(self, tgt, - tgt_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, - key_padding_mask=tgt_key_padding_mask)[0] - tgt = tgt + self.dropout(tgt2) - tgt = self.norm(tgt) - - return tgt - - def forward_pre(self, tgt, - tgt_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - tgt2 = self.norm(tgt) - q = k = self.with_pos_embed(tgt2, query_pos) - tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, - key_padding_mask=tgt_key_padding_mask)[0] - tgt = tgt + self.dropout(tgt2) - - return tgt - - def forward(self, tgt, - tgt_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - if self.normalize_before: - return self.forward_pre(tgt, tgt_mask, - tgt_key_padding_mask, query_pos) - return self.forward_post(tgt, tgt_mask, - tgt_key_padding_mask, query_pos) - - -class CrossAttentionLayer(nn.Module): - def __init__(self, d_model, nhead, dropout=0.0, - activation="relu", normalize_before=False): - super().__init__() - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - - self.norm = nn.LayerNorm(d_model) - self.dropout = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def 
with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post(self, tgt, memory, - memory_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask)[0] - tgt = tgt + self.dropout(tgt2) - tgt = self.norm(tgt) - - return tgt - - def forward_pre(self, tgt, memory, - memory_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - tgt2 = self.norm(tgt) - tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask)[0] - tgt = tgt + self.dropout(tgt2) - - return tgt - - def forward(self, tgt, memory, - memory_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - if self.normalize_before: - return self.forward_pre(tgt, memory, memory_mask, - memory_key_padding_mask, pos, query_pos) - return self.forward_post(tgt, memory, memory_mask, - memory_key_padding_mask, pos, query_pos) - - -class FFNLayer(nn.Module): - def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, - activation="relu", normalize_before=False): - super().__init__() - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm = nn.LayerNorm(d_model) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post(self, tgt): - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = tgt + self.dropout(tgt2) - tgt = self.norm(tgt) - return tgt - - def forward_pre(self, tgt): - tgt2 = self.norm(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) - tgt = tgt + self.dropout(tgt2) - return tgt - - def forward(self, tgt): - if self.normalize_before: - return self.forward_pre(tgt) - return self.forward_post(tgt) - - -def _get_activation_fn(activation): - """Return an activation function given a string""" - if activation == "relu": - return F.relu - if activation == "gelu": - return F.gelu - if activation == "glu": - return F.glu - raise RuntimeError(F"activation should be relu/gelu, not {activation}.") - - -class MLP(nn.Module): - """ Very simple multi-layer perceptron (also called FFN)""" - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -class MultiScaleMaskedTransformerDecoder(nn.Module): - def __init__( - self, - in_channels, - 
mask_classification=True, - *, - num_classes: int, - hidden_dim: int, - num_queries: int, - nheads: int, - dim_feedforward: int, - dec_layers: int, - pre_norm: bool, - mask_dim: int, - enforce_input_project: bool, - ): - """ - NOTE: this interface is experimental. - Args: - in_channels: channels of the input features - mask_classification: whether to add mask classifier or not - num_classes: number of classes - hidden_dim: Transformer feature dimension - num_queries: number of queries - nheads: number of heads - dim_feedforward: feature dimension in feedforward network - enc_layers: number of Transformer encoder layers - dec_layers: number of Transformer decoder layers - pre_norm: whether to use pre-LayerNorm or not - mask_dim: mask feature dimension - enforce_input_project: add input project 1x1 conv even if input - channels and hidden dim is identical - """ - super().__init__() - - assert mask_classification, "Only support mask classification model" - self.mask_classification = mask_classification - - # positional encoding - N_steps = hidden_dim // 2 - self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) - - # define Transformer decoder here - self.num_heads = nheads - self.num_layers = dec_layers - self.transformer_self_attention_layers = nn.ModuleList() - self.transformer_cross_attention_layers = nn.ModuleList() - self.transformer_ffn_layers = nn.ModuleList() - - for _ in range(self.num_layers): - self.transformer_self_attention_layers.append( - SelfAttentionLayer( - d_model=hidden_dim, - nhead=nheads, - dropout=0.0, - normalize_before=pre_norm, - ) - ) - - self.transformer_cross_attention_layers.append( - CrossAttentionLayer( - d_model=hidden_dim, - nhead=nheads, - dropout=0.0, - normalize_before=pre_norm, - ) - ) - - self.transformer_ffn_layers.append( - FFNLayer( - d_model=hidden_dim, - dim_feedforward=dim_feedforward, - dropout=0.0, - normalize_before=pre_norm, - ) - ) - - self.decoder_norm = nn.LayerNorm(hidden_dim) - - self.num_queries = num_queries - # learnable query features - self.query_feat = nn.Embedding(num_queries, hidden_dim) - # learnable query p.e. 
- self.query_embed = nn.Embedding(num_queries, hidden_dim) - - # level embedding (we always use 3 scales) - self.num_feature_levels = 3 - self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) - self.input_proj = nn.ModuleList() - for _ in range(self.num_feature_levels): - if in_channels != hidden_dim or enforce_input_project: - self.input_proj.append(nn.Conv2d(in_channels, hidden_dim, kernel_size=1)) - c2_xavier_fill(self.input_proj[-1]) - else: - self.input_proj.append(nn.Sequential()) - - # output FFNs - if self.mask_classification: - self.class_embed = nn.Linear(hidden_dim, num_classes + 1) - self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) - - def forward(self, x, mask_features, mask = None): - # x is a list of multi-scale feature - assert len(x) == self.num_feature_levels - src = [] - pos = [] - size_list = [] - - # disable mask, it does not affect performance - del mask - - for i in range(self.num_feature_levels): - size_list.append(x[i].shape[-2:]) - pos.append(self.pe_layer(x[i], None).flatten(2)) - src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) - - # flatten NxCxHxW to HWxNxC - pos[-1] = pos[-1].permute(2, 0, 1) - src[-1] = src[-1].permute(2, 0, 1) - - _, bs, _ = src[0].shape - - # QxNxC - query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) - output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) - - predictions_class = [] - predictions_mask = [] - - # prediction heads on learnable query features - outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) - predictions_class.append(outputs_class) - predictions_mask.append(outputs_mask) - - for i in range(self.num_layers): - level_index = i % self.num_feature_levels - attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False - # attention: cross-attention first - output = self.transformer_cross_attention_layers[i]( - output, src[level_index], - memory_mask=attn_mask, - memory_key_padding_mask=None, # here we do not apply masking on padded region - pos=pos[level_index], query_pos=query_embed - ) - - output = self.transformer_self_attention_layers[i]( - output, tgt_mask=None, - tgt_key_padding_mask=None, - query_pos=query_embed - ) - - # FFN - output = self.transformer_ffn_layers[i]( - output - ) - - outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) - predictions_class.append(outputs_class) - predictions_mask.append(outputs_mask) - - assert len(predictions_class) == self.num_layers + 1 - - out = { - 'pred_logits': predictions_class[-1], - 'pred_masks': predictions_mask[-1], - 'aux_outputs': self._set_aux_loss( - predictions_class if self.mask_classification else None, predictions_mask - ) - } - return out - - def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): - decoder_output = self.decoder_norm(output) - decoder_output = decoder_output.transpose(0, 1) - outputs_class = self.class_embed(decoder_output) - mask_embed = self.mask_embed(decoder_output) - outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) - - # NOTE: prediction is of higher-resolution - # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] - attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) - # must use bool type - # If a BoolTensor is provided, positions with ``True`` 
are not allowed to attend while ``False`` values will be unchanged. - attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() - attn_mask = attn_mask.detach() - - return outputs_class, outputs_mask, attn_mask - - @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_seg_masks): - # this is a workaround to make torchscript happy, as torchscript - # doesn't support dictionary with non-homogeneous values, such - # as a dict having both a Tensor and a list. - if self.mask_classification: - return [ - {"pred_logits": a, "pred_masks": b} - for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) - ] - else: - return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] diff --git a/torchsig/models/spectrogram_models/mask2former/utils.py b/torchsig/models/spectrogram_models/mask2former/utils.py deleted file mode 100644 index 7ab3b54..0000000 --- a/torchsig/models/spectrogram_models/mask2former/utils.py +++ /dev/null @@ -1,298 +0,0 @@ -import math -import numpy as np -import sympy -import timm -import torch -from torch import nn -from torch import Tensor -import torch.distributed as dist -from torch.optim.lr_scheduler import LambdaLR -import torchvision -from torchvision.ops.boxes import box_area -from torchvision.ops import masks_to_boxes -from typing import List, Optional - - -def drop_classifier(parent): - return torch.nn.Sequential(*list(parent.children())[:-2]) - - -def find_output_features(parent, num_features=0): - for n, m in parent.named_children(): - if type(m) is torch.nn.Conv2d: - num_features = m.out_channels - else: - num_features = find_output_features(m, num_features) - return num_features - - -# Several functions below pulled from public DETR repo: https://github.com/facebookresearch/detr -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -@torch.no_grad() -def accuracy(output, target, topk=(1,)): - """Computes the precision@k for the specified values of k""" - if target.numel() == 0: - return [torch.zeros([], device=output.device)] - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def collate_fn(batch): - return tuple(zip(*batch)) - - -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), - (x_c + 0.5 * w), (y_c + 0.5 * h)] - return torch.stack(b, dim=-1) - - -# modified from torchvision to also return the union -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - wh = (rb - lt).clamp(min=0) # [N,M,2] - inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from 
https://giou.stanford.edu/ - The boxes should be in [x0, y0, x1, y1] format - Returns a [N, M] pairwise matrix, where N = len(boxes1) - and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() - iou, union = box_iou(boxes1, boxes2) - - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] - - return iou - (area - union) / area - - -def calc_area(box): - return max(0,(box[1] - box[0])) * max(0,(box[3] - box[2])) - - -def calc_iou(box1, box2): - area1 = calc_area(box1) - area2 = calc_area(box2) - inter_x1 = max(box1[0], box2[0]) - inter_x2 = min(box1[1], box2[1]) - inter_y1 = max(box1[2], box2[2]) - inter_y2 = min(box1[3], box2[3]) - inter_area = max(0,calc_area([inter_x1, inter_x2, inter_y1, inter_y2])) - union = area1 + area2 - inter_area - iou = inter_area / union - return iou - - -def non_max_suppression_df(detected_signals_df, iou_threshold=0.75): - valid_indices = list(detected_signals_df.index) - remove_indices = [] - for det_idx in valid_indices: - for det_jdx in valid_indices: - if det_idx >= det_jdx: - continue - - # Check if same class - sig1_class = detected_signals_df.loc[det_idx]['Class'] - sig2_class = detected_signals_df.loc[det_jdx]['Class'] - - if sig1_class != sig2_class: - continue - - # convert df to box lists: (x1,x2,y1,y2) - sig1 = [ - detected_signals_df.loc[det_idx]['CenterTimePixel']-detected_signals_df.loc[det_idx]['DurationPixel']/2, - detected_signals_df.loc[det_idx]['CenterTimePixel']+detected_signals_df.loc[det_idx]['DurationPixel']/2, - detected_signals_df.loc[det_idx]['CenterFreqPixel']-detected_signals_df.loc[det_idx]['BandwidthPixel']/2, - detected_signals_df.loc[det_idx]['CenterFreqPixel']+detected_signals_df.loc[det_idx]['BandwidthPixel']/2 - ] - sig2 = [ - detected_signals_df.loc[det_jdx]['CenterTimePixel']-detected_signals_df.loc[det_jdx]['DurationPixel']/2, - detected_signals_df.loc[det_jdx]['CenterTimePixel']+detected_signals_df.loc[det_jdx]['DurationPixel']/2, - detected_signals_df.loc[det_jdx]['CenterFreqPixel']-detected_signals_df.loc[det_jdx]['BandwidthPixel']/2, - detected_signals_df.loc[det_jdx]['CenterFreqPixel']+detected_signals_df.loc[det_jdx]['BandwidthPixel']/2 - ] - - iou_score = calc_iou(sig1, sig2) - - if iou_score > iou_threshold: - # Probably the same signal, take higher confidence signal - sig1_prob = detected_signals_df.loc[det_idx]['Probability'] - sig2_prob = detected_signals_df.loc[det_jdx]['Probability'] - dup_idx = det_idx if sig1_prob < sig2_prob else det_jdx - - # remove from valid_indices - if dup_idx in valid_indices and dup_idx not in remove_indices: - remove_indices.append(dup_idx) - - remove_indices = sorted(remove_indices) - for idx in range(len(remove_indices)-1,-1,-1): - valid_indices.remove(remove_indices[idx]) - - detected_signals_df = detected_signals_df.loc[valid_indices].reset_index(drop=True) - detected_signals_df['DetectionIdx'] = detected_signals_df.index - return detected_signals_df - - -def get_cosine_schedule_with_warmup( - optimizer, - num_warmup_steps, - num_wait_steps, - num_training_steps, - num_cycles=0.5, - last_epoch=-1, -): - def lr_lambda(current_step): - if current_step < num_wait_steps: - return 0.0 - if current_step < num_warmup_steps + num_wait_steps: - return float(current_step - num_wait_steps) / max( - 1, float(num_warmup_steps) - ) - 
progress = float(current_step - (num_warmup_steps + num_wait_steps)) / float( - max(1, num_training_steps - (num_warmup_steps + num_wait_steps)) - ) - return max( - 0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)) - ) - - return LambdaLR(optimizer, lr_lambda, last_epoch) - - -def add_weight_decay(model): - decay = [] - no_decay = [] - for name, param in model.named_parameters(): - if not param.requires_grad: - continue - if "bn" in name: - no_decay.append(param) - else: - decay.append(param) - return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay}] - - -def format_preds(preds): - map_preds = [] - for (i, (det_logits, det_masks)) in enumerate(zip(preds['pred_logits'], preds['pred_masks'])): - boxes = [] - scores = [] - labels = [] - - # Convert Mask2Former output format to expected bboxes - num_objs = 0 - pred = {} - pred['pred_logits'] = det_logits - pred['pred_masks'] = det_masks - - det_list = [] - for obj_idx in range(pred['pred_logits'].shape[0]): - probs = pred['pred_logits'][obj_idx].softmax(-1) - max_prob = probs.max().cpu().detach().numpy() - max_class = probs.argmax().cpu().detach().numpy() - if max_class != (pred['pred_logits'].shape[1] - 1) and max_prob >= 0.5: - mask = torch.sigmoid(pred['pred_masks'][obj_idx]) - mask[mask > 0.5] = 1.0 - mask[mask != 1.0] = 0.0 - if mask.sum() > 0.0: - x1y1x2y2 = masks_to_boxes(mask.unsqueeze(0)).cpu().numpy()[0] - x1y1x2y2 = x1y1x2y2 / (pred['pred_masks'].shape[-1]-1) * 511 # Upscale - x1 = x1y1x2y2[0] - y1 = x1y1x2y2[1] - x2 = x1y1x2y2[2] - y2 = x1y1x2y2[3] - - boxes.append([x1, y1, x2, y2]) - scores.extend([float(max_prob)]) - labels.extend([int(max_class)]) - - curr_pred = dict( - boxes=torch.tensor(boxes).to("cuda"), - scores=torch.tensor(scores).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - - map_preds.append(curr_pred) - - return map_preds - - -def format_targets(labels): - map_targets = [] - - for i, label in enumerate(labels): - boxes = [] - scores = [] - labels = [] - - for label_obj_idx in range(len(label['labels'])): - mask = label['masks'][label_obj_idx] - if mask.sum() > 0.0: - x1y1x2y2 = masks_to_boxes(mask.unsqueeze(0)).numpy()[0] - x1 = x1y1x2y2[0] - y1 = x1y1x2y2[1] - x2 = x1y1x2y2[2] - y2 = x1y1x2y2[3] - - boxes.append([x1, y1, x2, y2]) - labels.extend([int(label['labels'][label_obj_idx])]) - - curr_target = dict( - boxes=torch.tensor(boxes).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - map_targets.append(curr_target) - - return map_targets \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/pspnet/LICENSE.md b/torchsig/models/spectrogram_models/pspnet/LICENSE.md deleted file mode 100644 index 8f14f4e..0000000 --- a/torchsig/models/spectrogram_models/pspnet/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License - -Copyright (c) 2019, Pavel Yakubovskiy - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/pspnet/README.md b/torchsig/models/spectrogram_models/pspnet/README.md deleted file mode 100644 index ec05c32..0000000 --- a/torchsig/models/spectrogram_models/pspnet/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# PSPNet - -The PSPNet code contained here relies on the [segmentation_models_pytorch](https://github.com/qubvel/segmentation_models.pytorch) library. - -The segmentation models pytorch library is licensed under an MIT license. This license is contained within this directory. diff --git a/torchsig/models/spectrogram_models/pspnet/__init__.py b/torchsig/models/spectrogram_models/pspnet/__init__.py deleted file mode 100644 index 00f4377..0000000 --- a/torchsig/models/spectrogram_models/pspnet/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .pspnet import * \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/pspnet/modules.py b/torchsig/models/spectrogram_models/pspnet/modules.py deleted file mode 100644 index feedc10..0000000 --- a/torchsig/models/spectrogram_models/pspnet/modules.py +++ /dev/null @@ -1,54 +0,0 @@ -import numpy as np -import torch -from torch import nn -import segmentation_models_pytorch as smp - -from .utils import replace_bn - - -class BootstrappedCrossEntropy(nn.Module): - def __init__( - self, K: float = 0.15, criterion: nn.Module = None, momentum: float = 0.99998, - ): - super(BootstrappedCrossEntropy, self).__init__() - assert criterion != None, "you must give a criterion function" - self.criterion = criterion - self.K = K - self.momentum = momentum - - def forward(self, pred, target, step): - B, C, H, W = pred.shape - num = int(self.K * B * H * W * max((self.momentum ** step), self.K)) - loss = self.criterion(pred, target) - loss = loss.view(-1) - tk = torch.argsort(loss, descending=True) - TK = loss[tk[num - 1]] - loss = loss[loss >= TK] - return loss.mean() - - -def create_pspnet( - encoder: str = 'efficientnet-b0', - num_classes: int = 53, -) -> torch.nn.Module: - """ - Function used to build a PSPNet network - - Args: - TODO - - Returns: - torch.nn.Module - """ - # Create PSPNet using the SMP library - # Note that the encoder is instantiated within the PSPNet call - network = smp.PSPNet( - encoder_name=encoder, - in_channels=2, - classes=num_classes, - ) - - # Replace batch norm with group norm - replace_bn(network) - - return network \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/pspnet/pspnet.py b/torchsig/models/spectrogram_models/pspnet/pspnet.py deleted file mode 100644 index 47096a4..0000000 --- a/torchsig/models/spectrogram_models/pspnet/pspnet.py +++ /dev/null @@ -1,281 +0,0 @@ -import timm -import gdown -import torch -import os.path -import numpy as np -from torch import nn - -from .modules import * -from .utils import * - -__all__ = [ - "pspnet_b0", "pspnet_b2", "pspnet_b4", - "pspnet_b0_mod_family", "pspnet_b2_mod_family", "pspnet_b4_mod_family", -] - -model_urls = { - "pspnet_b0": "1dSxMHzfiiqH8uAbWLhOy4jOmIJCP2M35", - "pspnet_b2": 
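    # Editor's annotation (not part of the original file): the values in model_urls are
    # Google Drive file IDs rather than full URLs. When one of the constructors below is
    # called with pretrained=True and no checkpoint exists at `path`, the file is fetched
    # with gdown.download(id=file_id, output=path) and loaded via
    # load_state_dict(torch.load(path), strict=False).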
"1VnDPdByVMihn1LMVRsU9-_Ndbzvzybvz", - "pspnet_b4": "13gLlx1sSi5t6njp6NnPsphDBN_yYvOu0", - "pspnet_b0_mod_family": "1I1FF0lek3APmrTHakz7LhmTMNkKSPcxg", - "pspnet_b2_mod_family": "1803E3cGMhi2QMmv-Yh27VgE438iheKyJ", - "pspnet_b4_mod_family": "1T8xVV2AnZIeEWIjXe9MKGK7kxdDfBxKM", -} - - -def pspnet_b0( - pretrained: bool = False, - path: str = "pspnet_b0.pt", - num_classes: int = 1, -): - """Constructs a PSPNet architecture with an EfficientNet-B0 backbone. - PSPNet from `"Pyramid Scene Parsing Network" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - NOTE: num_classes should equal the total number of classes **without** - including the background class. That "class" is automatically included. - - """ - # Create PSPNet-B0 - mdl = create_pspnet( - encoder='efficientnet-b0', - num_classes=1+1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['pspnet_b0'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.segmentation_head[0] = torch.nn.Conv2d( - in_channels=mdl.segmentation_head[0].in_channels, - out_channels=num_classes+1, - kernel_size=mdl.segmentation_head[0].kernel_size, - stride=mdl.segmentation_head[0].stride, - padding=mdl.segmentation_head[0].padding, - ) - return mdl - - -def pspnet_b2( - pretrained: bool = False, - path: str = "pspnet_b2.pt", - num_classes: int = 1, -): - """Constructs a PSPNet architecture with an EfficientNet-B2 backbone. - PSPNet from `"Pyramid Scene Parsing Network" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - NOTE: num_classes should equal the total number of classes **without** - including the background class. That "class" is automatically included. - - """ - # Create PSPNet-B2 - mdl = create_pspnet( - encoder='efficientnet-b2', - num_classes=1+1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['pspnet_b2'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.segmentation_head[0] = torch.nn.Conv2d( - in_channels=mdl.segmentation_head[0].in_channels, - out_channels=num_classes+1, - kernel_size=mdl.segmentation_head[0].kernel_size, - stride=mdl.segmentation_head[0].stride, - padding=mdl.segmentation_head[0].padding, - ) - return mdl - - -def pspnet_b4( - pretrained: bool = False, - path: str = "pspnet_b4.pt", - num_classes: int = 1, -): - """Constructs a PSPNet architecture with an EfficientNet-B4 backbone. - PSPNet from `"Pyramid Scene Parsing Network" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
- - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - NOTE: num_classes should equal the total number of classes **without** - including the background class. That "class" is automatically included. - - """ - # Create PSPNet-B4 - mdl = create_pspnet( - encoder='efficientnet-b4', - num_classes=1+1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['pspnet_b4'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.segmentation_head[0] = torch.nn.Conv2d( - in_channels=mdl.segmentation_head[0].in_channels, - out_channels=num_classes+1, - kernel_size=mdl.segmentation_head[0].kernel_size, - stride=mdl.segmentation_head[0].stride, - padding=mdl.segmentation_head[0].padding, - ) - return mdl - - -def pspnet_b0_mod_family( - pretrained: bool = False, - path: str = "pspnet_b0_mod_family.pt", - num_classes: int = 6, -): - """Constructs a PSPNet architecture with an EfficientNet-B0 backbone. - PSPNet from `"Pyramid Scene Parsing Network" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - NOTE: num_classes should equal the total number of classes **without** - including the background class. That "class" is automatically included. - - """ - # Create PSPNet-B0 - mdl = create_pspnet( - encoder='efficientnet-b0', - num_classes=6+1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['pspnet_b0_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.segmentation_head[0] = torch.nn.Conv2d( - in_channels=mdl.segmentation_head[0].in_channels, - out_channels=num_classes+1, - kernel_size=mdl.segmentation_head[0].kernel_size, - stride=mdl.segmentation_head[0].stride, - padding=mdl.segmentation_head[0].padding, - ) - return mdl - - -def pspnet_b2_mod_family( - pretrained: bool = False, - path: str = "pspnet_b2_mod_family.pt", - num_classes: int = 6, -): - """Constructs a PSPNet architecture with an EfficientNet-B2 backbone. - PSPNet from `"Pyramid Scene Parsing Network" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - NOTE: num_classes should equal the total number of classes **without** - including the background class. That "class" is automatically included. 
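    Example (illustrative sketch added by the editor, not part of the original file;
    assumes a two-channel 512x512 spectrogram input, as used elsewhere in this patch):

        >>> mdl = pspnet_b2_mod_family(pretrained=False, num_classes=6)
        >>> x = torch.randn(1, 2, 512, 512)
        >>> out = mdl(x)   # expected shape [1, num_classes + 1, 512, 512]; channel 0 is background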
- - """ - # Create PSPNet-B2 - mdl = create_pspnet( - encoder='efficientnet-b2', - num_classes=6+1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['pspnet_b2_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.segmentation_head[0] = torch.nn.Conv2d( - in_channels=mdl.segmentation_head[0].in_channels, - out_channels=num_classes+1, - kernel_size=mdl.segmentation_head[0].kernel_size, - stride=mdl.segmentation_head[0].stride, - padding=mdl.segmentation_head[0].padding, - ) - return mdl - - -def pspnet_b4_mod_family( - pretrained: bool = False, - path: str = "pspnet_b4_mod_family.pt", - num_classes: int = 6, -): - """Constructs a PSPNet architecture with an EfficientNet-B4 backbone. - PSPNet from `"Pyramid Scene Parsing Network" `_. - EfficientNet from `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - - Args: - pretrained (bool): - If True, returns a model pre-trained on WBSig53 - path (str): - Path to existing model or where to download checkpoint to - num_classes (int): - Number of output classes; if loading checkpoint and - number does not equal 1, final layer will not be loaded from checkpoint - NOTE: num_classes should equal the total number of classes **without** - including the background class. That "class" is automatically included. - - """ - # Create PSPNet-B4 - mdl = create_pspnet( - encoder='efficientnet-b4', - num_classes=6+1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['pspnet_b4_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.segmentation_head[0] = torch.nn.Conv2d( - in_channels=mdl.segmentation_head[0].in_channels, - out_channels=num_classes+1, - kernel_size=mdl.segmentation_head[0].kernel_size, - stride=mdl.segmentation_head[0].stride, - padding=mdl.segmentation_head[0].padding, - ) - return mdl diff --git a/torchsig/models/spectrogram_models/pspnet/utils.py b/torchsig/models/spectrogram_models/pspnet/utils.py deleted file mode 100644 index 2c281a9..0000000 --- a/torchsig/models/spectrogram_models/pspnet/utils.py +++ /dev/null @@ -1,140 +0,0 @@ -import sympy -import numpy as np -import torch -from torch import nn -from scipy import ndimage - - -def acc(y_hat, y): - y_hat = y_hat.argmax(1) - acc = ((y_hat == y)).float().mean() - return acc - - -def iou(y_hat, y): - y_hat = y_hat.argmax(1) - intersection = ((y_hat == 1) & (y == 1)).sum((1, 2)) - union = ((y_hat == 1) | (y == 1)).sum((1, 2)) - iou = (intersection.float() / union.float()).mean() - return iou - - -def class_iou(y_hat, y): - # print(y_hat.shape) # B, C, H, W - # print(y.shape) # B, H, W - y_hat = y_hat.argmax(1) - # print(y_hat.shape) # B, H, W - num_classes = 6 - iou = 0 - num_present = 0 - for batch_idx in range(y.shape[0]): - for class_idx in range(1, num_classes+1): - if (y == class_idx).float().sum() > 0: - intersection = ((y_hat == class_idx) & (y == class_idx)).sum((1, 2)) - union = ((y_hat == class_idx) | (y == class_idx)).sum((1, 2)) - class_iou = ((intersection.float() + 1e-6) / (union.float() + 1e-6)).mean() - iou += class_iou - num_present += 1 - return iou / num_present - - -def replace_bn(parent): - for n, m in parent.named_children(): - if type(m) is nn.BatchNorm2d: - setattr( - parent, - n, - nn.GroupNorm( - min( - sympy.divisors(m.num_features), - key=lambda x: 
np.abs(np.sqrt(m.num_features) - x), - ), - m.num_features, - ), - ) - else: - replace_bn(m) - - -def format_preds(preds, num_classes): - map_preds = [] - - # Loop over examples in batch - for pred in preds: - boxes = [] - scores = [] - labels = [] - - # Loop over classes - for class_idx in range(1,num_classes+1): - curr_pred = pred.argmax(0) - curr_indices = (curr_pred == class_idx).cpu().numpy() - curr_pred = np.zeros((preds.shape[-2], preds.shape[-1])) - curr_pred[curr_indices] = 1.0 - if curr_pred.sum() == 0: - continue - - image, num_features = ndimage.label(np.abs(curr_pred)) - objs = ndimage.find_objects(image) - - # # Remove small boxes and append to detected signal object - # min_dur = 2 # min time duration - # min_bw = 2 # min bw - # min_area = 4 - - for i, ob in enumerate(objs): - bw = ob[0].stop - ob[0].start - dur = ob[1].stop - ob[1].start - # if (dur > min_dur) and (bw > min_bw) and (bw*dur > min_area): - center_time = (ob[1].stop + ob[1].start) / 2 - center_freq = ob[0].start + bw/2 - - boxes.append([ob[1].start, ob[0].start, ob[1].stop, ob[0].stop]) - scores.extend([1.0]) - labels.extend([class_idx-1]) - - curr_pred = dict( - boxes=torch.tensor(boxes).to("cuda"), - scores=torch.tensor(scores).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - map_preds.append(curr_pred) - - return map_preds - - -def format_targets(targets, num_classes): - map_targets = [] - - # Loop over examples in batch - for target in targets: - boxes = [] - labels = [] - - # Loop over classes - for class_idx in range(1,num_classes+1): - curr_indices = (target == class_idx).cpu().numpy() - curr_target = np.zeros((targets.shape[-2], targets.shape[-1])) - curr_target[curr_indices] = 1.0 - if curr_target.sum() == 0: - continue - - image, num_features = ndimage.label(np.abs(curr_target)) - objs = ndimage.find_objects(image) - - for i, ob in enumerate(objs): - bw = ob[0].stop - ob[0].start - dur = ob[1].stop - ob[1].start - center_time = (ob[1].stop + ob[1].start) / 2 - center_freq = ob[0].start + bw/2 - - boxes.append([ob[1].start, ob[0].start, ob[1].stop, ob[0].stop]) - labels.extend([class_idx-1]) - - curr_target = dict( - boxes=torch.tensor(boxes).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - map_targets.append(curr_target) - - return map_targets diff --git a/torchsig/models/spectrogram_models/yolov5/LICENSE.md b/torchsig/models/spectrogram_models/yolov5/LICENSE.md deleted file mode 100644 index 9e419e0..0000000 --- a/torchsig/models/spectrogram_models/yolov5/LICENSE.md +++ /dev/null @@ -1,674 +0,0 @@ -GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. 
Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. 
- - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. 
This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. 
This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. 
- - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. 
- - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. 
- - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. 
If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. 
Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/yolov5/README.md b/torchsig/models/spectrogram_models/yolov5/README.md deleted file mode 100644 index ad9d5a1..0000000 --- a/torchsig/models/spectrogram_models/yolov5/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# YOLOv5 - -The YOLOv5 code contained here has been cloned, modified, and supplemented from its original [yolov5 github](https://github.com/ultralytics/yolov5). - -YOLOv5 is licensed under a GPL-3.0 license. This license for YOLOv5 is contained within this directory. 
diff --git a/torchsig/models/spectrogram_models/yolov5/__init__.py b/torchsig/models/spectrogram_models/yolov5/__init__.py deleted file mode 100644 index a444540..0000000 --- a/torchsig/models/spectrogram_models/yolov5/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .yolov5 import * \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/yolov5/mean_ap.py b/torchsig/models/spectrogram_models/yolov5/mean_ap.py deleted file mode 100644 index f8791c1..0000000 --- a/torchsig/models/spectrogram_models/yolov5/mean_ap.py +++ /dev/null @@ -1,802 +0,0 @@ -""" -Code is taken from: https://github.com/PyTorchLightning/metrics/blob/a971c6b456e40728b34494ff9186af20da46cb5b/torchmetrics/detection/mean_ap.py - -Modified slightly to patch bugs with device mismatches between cpu and cuda - -""" -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from typing import Any, Dict, List, Optional, Sequence, Tuple - -import torch -from torch import IntTensor, Tensor - -from torchmetrics.metric import Metric -from torchmetrics.utilities.imports import _TORCHVISION_GREATER_EQUAL_0_8 - -if _TORCHVISION_GREATER_EQUAL_0_8: - from torchvision.ops import box_area, box_convert, box_iou -else: - box_convert = box_iou = box_area = None - __doctest_skip__ = ["MeanAveragePrecision"] - -log = logging.getLogger(__name__) - - -class BaseMetricResults(dict): - """Base metric class, that allows fields for pre-defined metrics.""" - - def __getattr__(self, key: str) -> Tensor: - # Using this you get the correct error message, an AttributeError instead of a KeyError - if key in self: - return self[key] - raise AttributeError(f"No such attribute: {key}") - - def __setattr__(self, key: str, value: Tensor) -> None: - self[key] = value - - def __delattr__(self, key: str) -> None: - if key in self: - del self[key] - raise AttributeError(f"No such attribute: {key}") - - -class MAPMetricResults(BaseMetricResults): - """Class to wrap the final mAP results.""" - - __slots__ = ("map", "map_50", "map_75", "map_small", "map_medium", "map_large") - - -class MARMetricResults(BaseMetricResults): - """Class to wrap the final mAR results.""" - - __slots__ = ("mar_1", "mar_10", "mar_100", "mar_small", "mar_medium", "mar_large") - - -class COCOMetricResults(BaseMetricResults): - """Class to wrap the final COCO metric results including various mAP/mAR values.""" - - __slots__ = ( - "map", - "map_50", - "map_75", - "map_small", - "map_medium", - "map_large", - "mar_1", - "mar_10", - "mar_100", - "mar_small", - "mar_medium", - "mar_large", - "map_per_class", - "mar_100_per_class", - ) - - -def _input_validator(preds: Sequence[Dict[str, Tensor]], targets: Sequence[Dict[str, Tensor]]) -> None: - """Ensure the correct input format of `preds` and `targets`""" - if not isinstance(preds, Sequence): - raise ValueError("Expected argument `preds` to be of type Sequence") - if not isinstance(targets, Sequence): - raise ValueError("Expected argument `target` to be of type Sequence") - if len(preds) 
!= len(targets): - raise ValueError("Expected argument `preds` and `target` to have the same length") - - for k in ["boxes", "scores", "labels"]: - if any(k not in p for p in preds): - raise ValueError(f"Expected all dicts in `preds` to contain the `{k}` key") - - for k in ["boxes", "labels"]: - if any(k not in p for p in targets): - raise ValueError(f"Expected all dicts in `target` to contain the `{k}` key") - - if any(type(pred["boxes"]) is not Tensor for pred in preds): - raise ValueError("Expected all boxes in `preds` to be of type Tensor") - if any(type(pred["scores"]) is not Tensor for pred in preds): - raise ValueError("Expected all scores in `preds` to be of type Tensor") - if any(type(pred["labels"]) is not Tensor for pred in preds): - raise ValueError("Expected all labels in `preds` to be of type Tensor") - if any(type(target["boxes"]) is not Tensor for target in targets): - raise ValueError("Expected all boxes in `target` to be of type Tensor") - if any(type(target["labels"]) is not Tensor for target in targets): - raise ValueError("Expected all labels in `target` to be of type Tensor") - - for i, item in enumerate(targets): - if item["boxes"].size(0) != item["labels"].size(0): - raise ValueError( - f"Input boxes and labels of sample {i} in targets have a" - f" different length (expected {item['boxes'].size(0)} labels, got {item['labels'].size(0)})" - ) - for i, item in enumerate(preds): - if not (item["boxes"].size(0) == item["labels"].size(0) == item["scores"].size(0)): - raise ValueError( - f"Input boxes, labels and scores of sample {i} in predictions have a" - f" different length (expected {item['boxes'].size(0)} labels and scores," - f" got {item['labels'].size(0)} labels and {item['scores'].size(0)})" - ) - - -def _fix_empty_tensors(boxes: Tensor) -> Tensor: - """Empty tensors can cause problems in DDP mode, this methods corrects them.""" - if boxes.numel() == 0 and boxes.ndim == 1: - return boxes.unsqueeze(0) - return boxes - - -class MeanAveragePrecision(Metric): - r""" - Computes the `Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR) - `_ - for object detection predictions. - Optionally, the mAP and mAR values can be calculated per class. - - Predicted boxes and targets have to be in Pascal VOC format - (xmin-top left, ymin-top left, xmax-bottom right, ymax-bottom right). - See the :meth:`update` method for more information about the input format to this metric. - - For an example on how to use this metric check the `torchmetrics examples - `_ - - .. note:: - This metric is following the mAP implementation of - `pycocotools `_, - a standard implementation for the mAP metric for object detection. - - .. note:: - This metric requires you to have `torchvision` version 0.8.0 or newer installed (with corresponding - version 1.7.0 of torch or newer). Please install with ``pip install torchvision`` or - ``pip install torchmetrics[detection]``. - - Args: - box_format: - Input format of given boxes. Supported formats are ``[`xyxy`, `xywh`, `cxcywh`]``. - iou_thresholds: - IoU thresholds for evaluation. If set to ``None`` it corresponds to the stepped range ``[0.5,...,0.95]`` - with step ``0.05``. Else provide a list of floats. - rec_thresholds: - Recall thresholds for evaluation. If set to ``None`` it corresponds to the stepped range ``[0,...,1]`` - with step ``0.01``. Else provide a list of floats. - max_detection_thresholds: - Thresholds on max detections per image. If set to `None` will use thresholds ``[1, 10, 100]``. - Else, please provide a list of ints. 
- class_metrics: - Option to enable per-class metrics for mAP and mAR_100. Has a performance impact. - kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info. - - Example: - >>> import torch - >>> from torchmetrics.detection.mean_ap import MeanAveragePrecision - >>> preds = [ - ... dict( - ... boxes=torch.tensor([[258.0, 41.0, 606.0, 285.0]]), - ... scores=torch.tensor([0.536]), - ... labels=torch.tensor([0]), - ... ) - ... ] - >>> target = [ - ... dict( - ... boxes=torch.tensor([[214.0, 41.0, 562.0, 285.0]]), - ... labels=torch.tensor([0]), - ... ) - ... ] - >>> metric = MeanAveragePrecision() - >>> metric.update(preds, target) - >>> from pprint import pprint - >>> pprint(metric.compute()) - {'map': tensor(0.6000), - 'map_50': tensor(1.), - 'map_75': tensor(1.), - 'map_large': tensor(0.6000), - 'map_medium': tensor(-1.), - 'map_per_class': tensor(-1.), - 'map_small': tensor(-1.), - 'mar_1': tensor(0.6000), - 'mar_10': tensor(0.6000), - 'mar_100': tensor(0.6000), - 'mar_100_per_class': tensor(-1.), - 'mar_large': tensor(0.6000), - 'mar_medium': tensor(-1.), - 'mar_small': tensor(-1.)} - - Raises: - ModuleNotFoundError: - If ``torchvision`` is not installed or version installed is lower than 0.8.0 - ValueError: - If ``class_metrics`` is not a boolean - """ - is_differentiable: bool = False - higher_is_better: Optional[bool] = None - full_state_update: bool = True - - detection_boxes: List[Tensor] - detection_scores: List[Tensor] - detection_labels: List[Tensor] - groundtruth_boxes: List[Tensor] - groundtruth_labels: List[Tensor] - - def __init__( - self, - box_format: str = "xyxy", - iou_thresholds: Optional[List[float]] = None, - rec_thresholds: Optional[List[float]] = None, - max_detection_thresholds: Optional[List[int]] = None, - class_metrics: bool = False, - **kwargs: Dict[str, Any], - ) -> None: # type: ignore - super().__init__(**kwargs) - - if not _TORCHVISION_GREATER_EQUAL_0_8: - raise ModuleNotFoundError( - "`MeanAveragePrecision` metric requires that `torchvision` version 0.8.0 or newer is installed." - " Please install with `pip install torchvision>=0.8` or `pip install torchmetrics[detection]`." 
- ) - - allowed_box_formats = ("xyxy", "xywh", "cxcywh") - if box_format not in allowed_box_formats: - raise ValueError(f"Expected argument `box_format` to be one of {allowed_box_formats} but got {box_format}") - self.box_format = box_format - self.iou_thresholds = iou_thresholds or torch.linspace(0.5, 0.95, round((0.95 - 0.5) / 0.05) + 1).tolist() - self.rec_thresholds = rec_thresholds or torch.linspace(0.0, 1.00, round(1.00 / 0.01) + 1).tolist() - max_det_thr, _ = torch.sort(IntTensor(max_detection_thresholds or [1, 10, 100])) - self.max_detection_thresholds = max_det_thr.tolist() - self.bbox_area_ranges = { - "all": (0**2, int(1e5**2)), - "small": (0**2, 32**2), - "medium": (32**2, 96**2), - "large": (96**2, int(1e5**2)), - } - - if not isinstance(class_metrics, bool): - raise ValueError("Expected argument `class_metrics` to be a boolean") - - self.class_metrics = class_metrics - self.add_state("detection_boxes", default=[], dist_reduce_fx=None) - self.add_state("detection_scores", default=[], dist_reduce_fx=None) - self.add_state("detection_labels", default=[], dist_reduce_fx=None) - self.add_state("groundtruth_boxes", default=[], dist_reduce_fx=None) - self.add_state("groundtruth_labels", default=[], dist_reduce_fx=None) - - def update(self, preds: List[Dict[str, Tensor]], target: List[Dict[str, Tensor]]) -> None: # type: ignore - """Add detections and ground truth to the metric. - - Args: - preds: A list consisting of dictionaries each containing the key-values - (each dictionary corresponds to a single image): - - - ``boxes``: ``torch.FloatTensor`` of shape ``[num_boxes, 4]`` containing ``num_boxes`` detection boxes - of the format specified in the constructor. By default, this method expects - ``[xmin, ymin, xmax, ymax]`` in absolute image coordinates. - - ``scores``: ``torch.FloatTensor`` of shape ``[num_boxes]`` containing detection scores for the boxes. - - ``labels``: ``torch.IntTensor`` of shape ``[num_boxes]`` containing 0-indexed detection classes - for the boxes. - - target: A list consisting of dictionaries each containing the key-values - (each dictionary corresponds to a single image): - - - ``boxes``: ``torch.FloatTensor`` of shape ``[num_boxes, 4]`` containing ``num_boxes`` - ground truth boxes of the format specified in the constructor. By default, this method expects - ``[xmin, ymin, xmax, ymax]`` in absolute image coordinates. - - ``labels``: ``torch.IntTensor`` of shape ``[num_boxes]`` containing 1-indexed ground truth - classes for the boxes. 
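[Editor's sketch, not part of this patch; box values are made up.] The docstring above expects boxes in absolute (xmin, ymin, xmax, ymax) coordinates, and update() converts whatever box_format was configured into that layout using torchvision's box_convert. A minimal illustration of that conversion:

import torch
from torchvision.ops import box_convert

# One detection box given as (cx, cy, w, h); illustrative values only.
cxcywh = torch.tensor([[300.0, 200.0, 100.0, 50.0]])
xyxy = box_convert(cxcywh, in_fmt="cxcywh", out_fmt="xyxy")
print(xyxy)  # tensor([[250., 175., 350., 225.]])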
- - Raises: - ValueError: - If ``preds`` is not of type ``List[Dict[str, Tensor]]`` - ValueError: - If ``target`` is not of type ``List[Dict[str, Tensor]]`` - ValueError: - If ``preds`` and ``target`` are not of the same length - ValueError: - If any of ``preds.boxes``, ``preds.scores`` and ``preds.labels`` are not of the same length - ValueError: - If any of ``target.boxes`` and ``target.labels`` are not of the same length - ValueError: - If any box is not type float and of length 4 - ValueError: - If any class is not type int and of length 1 - ValueError: - If any score is not type float and of length 1 - """ - _input_validator(preds, target) - - for item in preds: - boxes = _fix_empty_tensors(item["boxes"]) - boxes = box_convert(boxes, in_fmt=self.box_format, out_fmt="xyxy") - self.detection_boxes.append(boxes) - self.detection_labels.append(item["labels"]) - self.detection_scores.append(item["scores"]) - - for item in target: - boxes = _fix_empty_tensors(item["boxes"]) - boxes = box_convert(boxes, in_fmt=self.box_format, out_fmt="xyxy") - self.groundtruth_boxes.append(boxes) - self.groundtruth_labels.append(item["labels"]) - - def _get_classes(self) -> List: - """Returns a list of unique classes found in ground truth and detection data.""" - if len(self.detection_labels) > 0 or len(self.groundtruth_labels) > 0: - return torch.cat(self.detection_labels + self.groundtruth_labels).unique().tolist() - return [] - - def _compute_iou(self, idx: int, class_id: int, max_det: int) -> Tensor: - """Computes the Intersection over Union (IoU) for ground truth and detection bounding boxes for the given - image and class. - - Args: - idx: - Image Id, equivalent to the index of supplied samples - class_id: - Class Id of the supplied ground truth and detection labels - max_det: - Maximum number of evaluated detection bounding boxes - """ - gt = self.groundtruth_boxes[idx] - det = self.detection_boxes[idx] - gt_label_mask = self.groundtruth_labels[idx] == class_id - det_label_mask = self.detection_labels[idx] == class_id - if len(gt_label_mask) == 0 or len(det_label_mask) == 0: - return Tensor([]) - gt = gt[gt_label_mask] - det = det[det_label_mask] - if len(gt) == 0 or len(det) == 0: - return Tensor([]) - - # Sort by scores and use only max detections - scores = self.detection_scores[idx] - scores_filtered = scores[self.detection_labels[idx] == class_id] - inds = torch.argsort(scores_filtered, descending=True) - det = det[inds] - if len(det) > max_det: - det = det[:max_det] - - # generalized_box_iou - ious = box_iou(det, gt) - return ious - - def __evaluate_image_gt_no_preds( - self, gt: Tensor, gt_label_mask: Tensor, area_range: Tuple[int, int], nb_iou_thrs: int - ) -> Dict[str, Any]: - """Some GT but no predictions.""" - # GTs - gt = gt[gt_label_mask] - nb_gt = len(gt) - areas = box_area(gt) - ignore_area = (areas < area_range[0]) | (areas > area_range[1]) - gt_ignore, _ = torch.sort(ignore_area.to(torch.uint8)) - gt_ignore = gt_ignore.to(torch.bool) - - # Detections - nb_det = 0 - det_ignore = torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=self.device) - - return { - "dtMatches": torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=self.device), - "gtMatches": torch.zeros((nb_iou_thrs, nb_gt), dtype=torch.bool, device=self.device), - "dtScores": torch.zeros(nb_det, dtype=torch.bool, device=self.device), - "gtIgnore": gt_ignore, - "dtIgnore": det_ignore, - } - - def __evaluate_image_preds_no_gt( - self, det: Tensor, idx: int, det_label_mask: Tensor, max_det: int, area_range: 
Tuple[int, int], nb_iou_thrs: int - ) -> Dict[str, Any]: - """Some predictions but no GT.""" - # GTs - nb_gt = 0 - gt_ignore = torch.zeros(nb_gt, dtype=torch.bool, device=self.device) - - # Detections - det = det[det_label_mask] - scores = self.detection_scores[idx] - scores_filtered = scores[det_label_mask] - scores_sorted, dtind = torch.sort(scores_filtered, descending=True) - det = det[dtind] - if len(det) > max_det: - det = det[:max_det] - nb_det = len(det) - det_areas = box_area(det).to(self.device) - det_ignore_area = (det_areas < area_range[0]) | (det_areas > area_range[1]) - ar = det_ignore_area.reshape((1, nb_det)) - det_ignore = torch.repeat_interleave(ar, nb_iou_thrs, 0) - - return { - "dtMatches": torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=self.device), - "gtMatches": torch.zeros((nb_iou_thrs, nb_gt), dtype=torch.bool, device=self.device), - "dtScores": scores_sorted, - "gtIgnore": gt_ignore, - "dtIgnore": det_ignore, - } - - def _evaluate_image( - self, idx: int, class_id: int, area_range: Tuple[int, int], max_det: int, ious: dict - ) -> Optional[dict]: - """Perform evaluation for single class and image. - - Args: - idx: - Image Id, equivalent to the index of supplied samples. - class_id: - Class Id of the supplied ground truth and detection labels. - area_range: - List of lower and upper bounding box area threshold. - max_det: - Maximum number of evaluated detection bounding boxes. - ious: - IoU results for image and class. - """ - gt = self.groundtruth_boxes[idx] - det = self.detection_boxes[idx] - gt_label_mask = self.groundtruth_labels[idx] == class_id - det_label_mask = self.detection_labels[idx] == class_id - - # No Gt and No predictions --> ignore image - if len(gt_label_mask) == 0 and len(det_label_mask) == 0: - return None - - nb_iou_thrs = len(self.iou_thresholds) - - # Some GT but no predictions - if len(gt_label_mask) > 0 and len(det_label_mask) == 0: - return self.__evaluate_image_gt_no_preds(gt, gt_label_mask, area_range, nb_iou_thrs) - - # Some predictions but no GT - if len(gt_label_mask) == 0 and len(det_label_mask) >= 0: - return self.__evaluate_image_preds_no_gt(det, idx, det_label_mask, max_det, area_range, nb_iou_thrs) - - gt = gt[gt_label_mask] - det = det[det_label_mask] - if gt.numel() == 0 and det.numel() == 0: - return None - - areas = box_area(gt) - ignore_area = (areas < area_range[0]) | (areas > area_range[1]) - - # sort dt highest score first, sort gt ignore last - ignore_area_sorted, gtind = torch.sort(ignore_area.to(torch.uint8)) - # Convert to uint8 temporarily and back to bool, because "Sort currently does not support bool dtype on CUDA" - ignore_area_sorted = ignore_area_sorted.to(torch.bool) - gt = gt[gtind] - scores = self.detection_scores[idx] - scores_filtered = scores[det_label_mask] - scores_sorted, dtind = torch.sort(scores_filtered, descending=True) - det = det[dtind] - if len(det) > max_det: - det = det[:max_det] - # load computed ious - ious = ious[idx, class_id][:, gtind] if len(ious[idx, class_id]) > 0 else ious[idx, class_id] - - nb_iou_thrs = len(self.iou_thresholds) - nb_gt = len(gt) - nb_det = len(det) - gt_matches = torch.zeros((nb_iou_thrs, nb_gt), dtype=torch.bool, device=gt.device) - det_matches = torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=gt.device) - gt_ignore = ignore_area_sorted - det_ignore = torch.zeros((nb_iou_thrs, nb_det), dtype=torch.bool, device=gt.device) - - if torch.numel(ious) > 0: - for idx_iou, t in enumerate(self.iou_thresholds): - for idx_det, _ in enumerate(det): - m = 
MeanAveragePrecision._find_best_gt_match(t, gt_matches, idx_iou, gt_ignore, ious, idx_det) - if m == -1: - continue - det_ignore[idx_iou, idx_det] = gt_ignore[m] - det_matches[idx_iou, idx_det] = 1 - gt_matches[idx_iou, m] = 1 - - # set unmatched detections outside of area range to ignore - det_areas = box_area(det) - det_ignore_area = (det_areas < area_range[0]) | (det_areas > area_range[1]) - ar = det_ignore_area.reshape((1, nb_det)) - det_ignore = torch.logical_or( - det_ignore, torch.logical_and(det_matches == 0, torch.repeat_interleave(ar, nb_iou_thrs, 0)) - ) - return { - "dtMatches": det_matches.to(self.device), - "gtMatches": gt_matches.to(self.device), - "dtScores": scores_sorted.to(self.device), - "gtIgnore": gt_ignore.to(self.device), - "dtIgnore": det_ignore.to(self.device), - } - - @staticmethod - def _find_best_gt_match( - thr: int, gt_matches: Tensor, idx_iou: float, gt_ignore: Tensor, ious: Tensor, idx_det: int - ) -> int: - """Return id of best ground truth match with current detection. - - Args: - thr: - Current threshold value. - gt_matches: - Tensor showing if a ground truth matches for threshold ``t`` exists. - idx_iou: - Id of threshold ``t``. - gt_ignore: - Tensor showing if ground truth should be ignored. - ious: - IoUs for all combinations of detection and ground truth. - idx_det: - Id of current detection. - """ - previously_matched = gt_matches[idx_iou] - # Remove previously matched or ignored gts - remove_mask = previously_matched | gt_ignore - gt_ious = ious[idx_det] * ~remove_mask - match_idx = gt_ious.argmax().item() - if gt_ious[match_idx] > thr: - return match_idx - return -1 - - def _summarize( - self, - results: Dict, - avg_prec: bool = True, - iou_threshold: Optional[float] = None, - area_range: str = "all", - max_dets: int = 100, - ) -> Tensor: - """Perform evaluation for single class and image. - - Args: - results: - Dictionary including precision, recall and scores for all combinations. - avg_prec: - Calculate average precision. Else calculate average recall. - iou_threshold: - IoU threshold. If set to ``None`` it all values are used. Else results are filtered. - area_range: - Bounding box area range key. - max_dets: - Maximum detections. - """ - area_inds = [i for i, k in enumerate(self.bbox_area_ranges.keys()) if k == area_range] - mdet_inds = [i for i, k in enumerate(self.max_detection_thresholds) if k == max_dets] - if avg_prec: - # dimension of precision: [TxRxKxAxM] - prec = results["precision"] - # IoU - if iou_threshold is not None: - thr = self.iou_thresholds.index(iou_threshold) - prec = prec[thr, :, :, area_inds, mdet_inds] - else: - prec = prec[:, :, :, area_inds, mdet_inds] - else: - # dimension of recall: [TxKxAxM] - prec = results["recall"] - if iou_threshold is not None: - thr = self.iou_thresholds.index(iou_threshold) - prec = prec[thr, :, :, area_inds, mdet_inds] - else: - prec = prec[:, :, area_inds, mdet_inds] - - mean_prec = torch.tensor([-1.0]) if len(prec[prec > -1]) == 0 else torch.mean(prec[prec > -1]) - return mean_prec - - def _calculate(self, class_ids: List) -> Tuple[MAPMetricResults, MARMetricResults]: - """Calculate the precision and recall for all supplied classes to calculate mAP/mAR. - - Args: - class_ids: - List of label class Ids. 
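[Editor's sketch, not part of this patch; numbers are made up.] The greedy rule _find_best_gt_match applies above: ground truths that are already matched or ignored are masked out, and the remaining ground truth with the highest IoU is accepted only if it clears the current threshold.

import torch

ious_for_one_det = torch.tensor([0.30, 0.62, 0.55])   # IoU of this detection vs. each GT
already_matched = torch.tensor([False, True, False])  # GT 1 was claimed by an earlier detection
ignored = torch.tensor([False, False, False])

masked = ious_for_one_det * ~(already_matched | ignored)
best = int(masked.argmax())
match = best if masked[best] > 0.5 else -1            # threshold 0.5, as in iou_thresholds
print(match)  # 2 -> this detection matches GT 2 at IoU 0.55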
- """ - img_ids = range(len(self.groundtruth_boxes)) - max_detections = self.max_detection_thresholds[-1] - area_ranges = self.bbox_area_ranges.values() - - ious = { - (idx, class_id): self._compute_iou(idx, class_id, max_detections) - for idx in img_ids - for class_id in class_ids - } - - eval_imgs = [ - self._evaluate_image(img_id, class_id, area, max_detections, ious) - for class_id in class_ids - for area in area_ranges - for img_id in img_ids - ] - - nb_iou_thrs = len(self.iou_thresholds) - nb_rec_thrs = len(self.rec_thresholds) - nb_classes = len(class_ids) - nb_bbox_areas = len(self.bbox_area_ranges) - nb_max_det_thrs = len(self.max_detection_thresholds) - nb_imgs = len(img_ids) - precision = -torch.ones((nb_iou_thrs, nb_rec_thrs, nb_classes, nb_bbox_areas, nb_max_det_thrs)) - recall = -torch.ones((nb_iou_thrs, nb_classes, nb_bbox_areas, nb_max_det_thrs)) - scores = -torch.ones((nb_iou_thrs, nb_rec_thrs, nb_classes, nb_bbox_areas, nb_max_det_thrs)) - - # move tensors if necessary - rec_thresholds_tensor = torch.tensor(self.rec_thresholds) - - # retrieve E at each category, area range, and max number of detections - for idx_cls, _ in enumerate(class_ids): - for idx_bbox_area, _ in enumerate(self.bbox_area_ranges): - for idx_max_det_thrs, max_det in enumerate(self.max_detection_thresholds): - recall, precision, scores = MeanAveragePrecision.__calculate_recall_precision_scores( - recall, - precision, - scores, - idx_cls=idx_cls, - idx_bbox_area=idx_bbox_area, - idx_max_det_thrs=idx_max_det_thrs, - eval_imgs=eval_imgs, - rec_thresholds=rec_thresholds_tensor, - max_det=max_det, - nb_imgs=nb_imgs, - nb_bbox_areas=nb_bbox_areas, - ) - - return precision, recall - - def _summarize_results(self, precisions: Tensor, recalls: Tensor) -> Tuple[MAPMetricResults, MARMetricResults]: - """Summarizes the precision and recall values to calculate mAP/mAR. 
- - Args: - precisions: - Precision values for different thresholds - recalls: - Recall values for different thresholds - """ - results = dict(precision=precisions, recall=recalls) - map_metrics = MAPMetricResults() - map_metrics.map = self._summarize(results, True) - last_max_det_thr = self.max_detection_thresholds[-1] - if 0.5 in self.iou_thresholds: - map_metrics.map_50 = self._summarize(results, True, iou_threshold=0.5, max_dets=last_max_det_thr) - else: - map_metrics.map_50 = torch.tensor([-1]) - if 0.75 in self.iou_thresholds: - map_metrics.map_75 = self._summarize(results, True, iou_threshold=0.75, max_dets=last_max_det_thr) - else: - map_metrics.map_75 = torch.tensor([-1]) - map_metrics.map_small = self._summarize(results, True, area_range="small", max_dets=last_max_det_thr) - map_metrics.map_medium = self._summarize(results, True, area_range="medium", max_dets=last_max_det_thr) - map_metrics.map_large = self._summarize(results, True, area_range="large", max_dets=last_max_det_thr) - - mar_metrics = MARMetricResults() - for max_det in self.max_detection_thresholds: - mar_metrics[f"mar_{max_det}"] = self._summarize(results, False, max_dets=max_det) - mar_metrics.mar_small = self._summarize(results, False, area_range="small", max_dets=last_max_det_thr) - mar_metrics.mar_medium = self._summarize(results, False, area_range="medium", max_dets=last_max_det_thr) - mar_metrics.mar_large = self._summarize(results, False, area_range="large", max_dets=last_max_det_thr) - - return map_metrics, mar_metrics - - @staticmethod - def __calculate_recall_precision_scores( - recall: Tensor, - precision: Tensor, - scores: Tensor, - idx_cls: int, - idx_bbox_area: int, - idx_max_det_thrs: int, - eval_imgs: list, - rec_thresholds: Tensor, - max_det: int, - nb_imgs: int, - nb_bbox_areas: int, - ) -> Tuple[Tensor, Tensor, Tensor]: - nb_rec_thrs = len(rec_thresholds) - idx_cls_pointer = idx_cls * nb_bbox_areas * nb_imgs - idx_bbox_area_pointer = idx_bbox_area * nb_imgs - # Load all image evals for current class_id and area_range - img_eval_cls_bbox = [eval_imgs[idx_cls_pointer + idx_bbox_area_pointer + i] for i in range(nb_imgs)] - img_eval_cls_bbox = [e for e in img_eval_cls_bbox if e is not None] - if not img_eval_cls_bbox: - return recall, precision, scores - # det_scores = torch.cat([e["dtScores"][:max_det].to("cuda") for e in img_eval_cls_bbox]) - det_scores = torch.cat([e["dtScores"][:max_det].to(torch.uint8).to("cuda") for e in img_eval_cls_bbox]) - - # different sorting method generates slightly different results. - # mergesort is used to be consistent as Matlab implementation. 
- # Sort in PyTorch does not support bool types on CUDA (yet, 1.11.0) - dtype = torch.uint8 if det_scores.is_cuda and det_scores.dtype is torch.bool else det_scores.dtype - # Explicitly cast to uint8 to avoid error for bool inputs on CUDA to argsort - inds = torch.argsort(det_scores.to(dtype), descending=True) - det_scores_sorted = det_scores[inds] - - det_matches = torch.cat([e["dtMatches"][:, :max_det].to("cuda") for e in img_eval_cls_bbox], axis=1)[:, inds] - det_ignore = torch.cat([e["dtIgnore"][:, :max_det].to("cuda") for e in img_eval_cls_bbox], axis=1)[:, inds] - gt_ignore = torch.cat([e["gtIgnore"].to("cuda") for e in img_eval_cls_bbox]) - npig = torch.count_nonzero(gt_ignore == False) # noqa: E712 - if npig == 0: - return recall, precision, scores - tps = torch.logical_and(det_matches, torch.logical_not(det_ignore)) - fps = torch.logical_and(torch.logical_not(det_matches), torch.logical_not(det_ignore)) - - tp_sum = torch.cumsum(tps, axis=1, dtype=torch.float) - fp_sum = torch.cumsum(fps, axis=1, dtype=torch.float) - for idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): - nd = len(tp) - rc = tp / npig - pr = tp / (fp + tp + torch.finfo(torch.float64).eps) - prec = torch.zeros((nb_rec_thrs,)) - score = torch.zeros((nb_rec_thrs,)) - - recall[idx, idx_cls, idx_bbox_area, idx_max_det_thrs] = rc[-1] if nd else 0 - - # Remove zigzags for AUC - diff_zero = torch.zeros((1,), device=pr.device) - diff = torch.ones((1,), device=pr.device) - while not torch.all(diff == 0): - diff = torch.clamp(torch.cat((pr[1:] - pr[:-1], diff_zero), 0), min=0) - pr += diff - - inds = torch.searchsorted(rc, rec_thresholds.to(rc.device), right=False) - num_inds = inds.argmax() if inds.max() >= nd else nb_rec_thrs - inds = inds[:num_inds] - prec[:num_inds] = pr[inds] - score[:num_inds] = det_scores_sorted[inds] - precision[idx, :, idx_cls, idx_bbox_area, idx_max_det_thrs] = prec - scores[idx, :, idx_cls, idx_bbox_area, idx_max_det_thrs] = score - - return recall, precision, scores - - def compute(self) -> dict: - """Compute the `Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR)` scores. - - Note: - ``map`` score is calculated with @[ IoU=self.iou_thresholds | area=all | max_dets=max_detection_thresholds ] - - Caution: If the initialization parameters are changed, dictionary keys for mAR can change as well. - The default properties are also accessible via fields and will raise an ``AttributeError`` if not available. 
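[Editor's sketch, not part of this patch; values are illustrative.] The "Remove zigzags for AUC" loop above makes the precision curve monotonically non-increasing before it is interpolated at the recall thresholds. The same envelope can be computed in one pass with torch.cummax:

import torch

pr = torch.tensor([0.9, 0.6, 0.8, 0.5, 0.7])
# Each point is raised to the maximum precision at any higher recall,
# which is what the iterative diff-and-add loop converges to.
envelope = torch.flip(torch.cummax(torch.flip(pr, dims=[0]), dim=0).values, dims=[0])
print(envelope)  # tensor([0.9000, 0.8000, 0.8000, 0.7000, 0.7000])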
- - Returns: - dict containing - - - map: ``torch.Tensor`` - - map_small: ``torch.Tensor`` - - map_medium: ``torch.Tensor`` - - map_large: ``torch.Tensor`` - - mar_1: ``torch.Tensor`` - - mar_10: ``torch.Tensor`` - - mar_100: ``torch.Tensor`` - - mar_small: ``torch.Tensor`` - - mar_medium: ``torch.Tensor`` - - mar_large: ``torch.Tensor`` - - map_50: ``torch.Tensor`` (-1 if 0.5 not in the list of iou thresholds) - - map_75: ``torch.Tensor`` (-1 if 0.75 not in the list of iou thresholds) - - map_per_class: ``torch.Tensor`` (-1 if class metrics are disabled) - - mar_100_per_class: ``torch.Tensor`` (-1 if class metrics are disabled) - """ - classes = self._get_classes() - precisions, recalls = self._calculate(classes) - map_val, mar_val = self._summarize_results(precisions, recalls) - - # if class mode is enabled, evaluate metrics per class - map_per_class_values: Tensor = torch.tensor([-1.0]) - mar_max_dets_per_class_values: Tensor = torch.tensor([-1.0]) - if self.class_metrics: - map_per_class_list = [] - mar_max_dets_per_class_list = [] - - for class_idx, _ in enumerate(classes): - cls_precisions = precisions[:, :, class_idx].unsqueeze(dim=2) - cls_recalls = recalls[:, class_idx].unsqueeze(dim=1) - cls_map, cls_mar = self._summarize_results(cls_precisions, cls_recalls) - map_per_class_list.append(cls_map.map) - mar_max_dets_per_class_list.append(cls_mar[f"mar_{self.max_detection_thresholds[-1]}"]) - - map_per_class_values = torch.tensor(map_per_class_list, dtype=torch.float) - mar_max_dets_per_class_values = torch.tensor(mar_max_dets_per_class_list, dtype=torch.float) - - metrics = COCOMetricResults() - metrics.update(map_val) - metrics.update(mar_val) - metrics.map_per_class = map_per_class_values - metrics[f"mar_{self.max_detection_thresholds[-1]}_per_class"] = mar_max_dets_per_class_values - return metrics \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/yolov5/modules.py b/torchsig/models/spectrogram_models/yolov5/modules.py deleted file mode 100644 index 96dd3f6..0000000 --- a/torchsig/models/spectrogram_models/yolov5/modules.py +++ /dev/null @@ -1,804 +0,0 @@ -import os -import sys -import math -import yaml -import warnings -import numpy as np -from pathlib import Path -from copy import copy, deepcopy -import torch -from torch import nn -from torch.nn import functional as F - -from .utils import make_divisible, check_anchor_order, initialize_weights, model_info -from .utils import bbox_iou, is_parallel, check_version - -try: - import thop # for FLOPs computation -except ImportError: - thop = None - - -def smooth_BCE(eps=0.1): - # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441 - # return positive, negative label smoothing BCE targets - return 1.0 - 0.5 * eps, 0.5 * eps - - -class BCEBlurWithLogitsLoss(nn.Module): - # BCEwithLogitLoss() with reduced missing label effects. - def __init__(self, alpha=0.05): - super().__init__() - self.loss_fcn = nn.BCEWithLogitsLoss(reduction='none') # must be nn.BCEWithLogitsLoss() - self.alpha = alpha - - def forward(self, pred, true): - loss = self.loss_fcn(pred, true) - pred = torch.sigmoid(pred) # prob from logits - dx = pred - true # reduce only missing label effects - # dx = (pred - true).abs() # reduce missing label and false label effects - alpha_factor = 1 - torch.exp((dx - 1) / (self.alpha + 1e-4)) - loss *= alpha_factor - return loss.mean() - - -class FocalLoss(nn.Module): - # Wraps focal loss around existing loss_fcn(), i.e. 
criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5) - def __init__(self, loss_fcn, gamma=1.5, alpha=0.25): - super().__init__() - self.loss_fcn = loss_fcn # must be nn.BCEWithLogitsLoss() - self.gamma = gamma - self.alpha = alpha - self.reduction = loss_fcn.reduction - self.loss_fcn.reduction = 'none' # required to apply FL to each element - - def forward(self, pred, true): - loss = self.loss_fcn(pred, true) - # p_t = torch.exp(-loss) - # loss *= self.alpha * (1.000001 - p_t) ** self.gamma # non-zero power for gradient stability - - # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py - pred_prob = torch.sigmoid(pred) # prob from logits - p_t = true * pred_prob + (1 - true) * (1 - pred_prob) - alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha) - modulating_factor = (1.0 - p_t) ** self.gamma - loss *= alpha_factor * modulating_factor - - if self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: # 'none' - return loss - - -class QFocalLoss(nn.Module): - # Wraps Quality focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5) - def __init__(self, loss_fcn, gamma=1.5, alpha=0.25): - super().__init__() - self.loss_fcn = loss_fcn # must be nn.BCEWithLogitsLoss() - self.gamma = gamma - self.alpha = alpha - self.reduction = loss_fcn.reduction - self.loss_fcn.reduction = 'none' # required to apply FL to each element - - def forward(self, pred, true): - loss = self.loss_fcn(pred, true) - - pred_prob = torch.sigmoid(pred) # prob from logits - alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha) - modulating_factor = torch.abs(true - pred_prob) ** self.gamma - loss *= alpha_factor * modulating_factor - - if self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: # 'none' - return loss - - -class ComputeLoss: - # Compute losses - def __init__( - self, - model, - autobalance: bool = False, - box: float = 0.05, # box loss gain - cls: float = 0.5, # cls loss gain - obj: float = 1.0, # obj loss gain (scale with pixels) - cls_pw: float = 1.0, # cls BCELoss positive_weight - obj_pw: float = 1.0, # obj BCELoss positive_weight - anchor_t: float = 4.0, # anchor-multiple threshold - label_smoothing: float = 0.0, # label-smoothing epsilon - fl_gamma: float = 0.0, # focal loss gamma (EfficientDet default gamma=1.5) - ): - self.sort_obj_iou = False - device = next(model.parameters()).device # get model device - h = { - "box": box, - "cls": cls, - "obj": obj, - "cls_pw": cls_pw, - "obj_pw": obj_pw, - "anchor_t": anchor_t, - "label_smoothing": label_smoothing, - "fl_gamma": fl_gamma, - } - - # Define criteria - BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device)) - BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device)) - - # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3 - self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0)) # positive, negative BCE targets - - # Focal loss - g = h['fl_gamma'] # focal loss gamma - if g > 0: - BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g) - - det = model.module.model[-1] if is_parallel(model) else model.model[-1] # Detect() module - self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, 0.02]) # P3-P7 - self.ssi = list(det.stride).index(16) if autobalance else 0 # stride 16 index - self.BCEcls = BCEcls - self.BCEobj = BCEobj - self.gr = 
1.0 - self.hyp = h - self.autobalance = autobalance - for k in 'na', 'nc', 'nl', 'anchors': - setattr(self, k, getattr(det, k)) - - def __call__(self, p, targets): # predictions, targets, model - device = targets.device - self.BCEcls = self.BCEcls.to(device) - self.BCEobj = self.BCEobj.to(device) - lcls = torch.zeros(1, device=device) - lbox = torch.zeros(1, device=device) - lobj = torch.zeros(1, device=device) - tcls, tbox, indices, anchors = self.build_targets(p, targets) # targets - - # Losses - for i, pi in enumerate(p): # layer index, layer predictions - b, a, gj, gi = indices[i] # image, anchor, gridy, gridx - tobj = torch.zeros_like(pi[..., 0], device=device) # target obj - - n = b.shape[0] # number of targets - if n: - ps = pi[b, a, gj, gi] # prediction subset corresponding to targets - - # Regression - pxy = ps[:, :2].sigmoid() * 2 - 0.5 - pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i] - pbox = torch.cat((pxy, pwh), 1) # predicted box - iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, CIoU=True) # iou(prediction, target) - lbox += (1.0 - iou).mean() # iou loss - - # Objectness - score_iou = iou.detach().clamp(0).type(tobj.dtype) - if self.sort_obj_iou: - sort_id = torch.argsort(score_iou) - b, a, gj, gi, score_iou = b[sort_id], a[sort_id], gj[sort_id], gi[sort_id], score_iou[sort_id] - tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * score_iou # iou ratio - - # Classification - if self.nc > 1: # cls loss (only if multiple classes) - t = torch.full_like(ps[:, 5:], self.cn, device=device) # targets - t[range(n), tcls[i]] = self.cp - lcls += self.BCEcls(ps[:, 5:], t) # BCE - - obji = self.BCEobj(pi[..., 4], tobj) - lobj += obji * self.balance[i] # obj loss - if self.autobalance: - self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item() - - if self.autobalance: - self.balance = [x / self.balance[self.ssi] for x in self.balance] - lbox *= self.hyp['box'] - lobj *= self.hyp['obj'] - lcls *= self.hyp['cls'] - bs = tobj.shape[0] # batch size - - return (lbox + lobj + lcls) * bs, torch.cat((lbox, lobj, lcls)).detach() - - def build_targets(self, p, targets): - # Build targets for compute_loss(), input targets(image,class,x,y,w,h) - na, nt = self.na, targets.shape[0] # number of anchors, targets - tcls, tbox, indices, anch = [], [], [], [] - gain = torch.ones(7, device=targets.device) # normalized to gridspace gain - ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt) # same as .repeat_interleave(nt) - targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2) # append anchor indices - - g = 0.5 # bias - off = torch.tensor([[0, 0], - [1, 0], [0, 1], [-1, 0], [0, -1], # j,k,l,m - # [1, 1], [1, -1], [-1, 1], [-1, -1], # jk,jm,lk,lm - ], device=targets.device).float() * g # offsets - - for i in range(self.nl): - anchors = self.anchors[i].to(targets.device) - gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]] # xyxy gain - - # Match targets to anchors - t = targets * gain - if nt: - # Matches - r = t[:, :, 4:6] / anchors[:, None] # wh ratio - j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t'] # compare - # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t'] # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2)) - t = t[j] # filter - - # Offsets - gxy = t[:, 2:4] # grid xy - gxi = gain[[2, 3]] - gxy # inverse - j, k = ((gxy % 1 < g) & (gxy > 1)).T - l, m = ((gxi % 1 < g) & (gxi > 1)).T - j = torch.stack((torch.ones_like(j), j, k, l, m)) - t = t.repeat((5, 1, 1))[j] - offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j] - else: - t = 
targets[0] - offsets = 0 - - # Define - b, c = t[:, :2].long().T # image, class - gxy = t[:, 2:4] # grid xy - gwh = t[:, 4:6] # grid wh - gij = (gxy - offsets).long() - gi, gj = gij.T # grid xy indices - - # Append - a = t[:, 6].long() # anchor indices - indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1))) # image, anchor, grid indices - tbox.append(torch.cat((gxy - gij, gwh), 1)) # box - anch.append(anchors[a]) # anchors - tcls.append(c) # class - - return tcls, tbox, indices, anch - - -class Detect(nn.Module): - stride = None # strides computed during build - onnx_dynamic = False # ONNX export parameter - - def __init__(self, nc=80, anchors=(), ch=(), inplace=True): # detection layer - super().__init__() - self.nc = nc # number of classes - self.no = nc + 5 # number of outputs per anchor - self.nl = len(anchors) # number of detection layers - self.na = len(anchors[0]) // 2 # number of anchors - self.grid = [torch.zeros(1)] * self.nl # init grid - self.anchor_grid = [torch.zeros(1)] * self.nl # init anchor grid - self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2)) # shape(nl,na,2) - self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch) # output conv - self.inplace = inplace # use in-place ops (e.g. slice assignment) - - def forward(self, x): - z = [] # inference output - for i in range(self.nl): - x[i] = self.m[i](x[i]) # conv - bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85) - x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous() - - if not self.training: # inference - if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]: - self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i) - - y = x[i].sigmoid() - if self.inplace: - y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i] # xy - y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh - else: # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953 - xy = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i] # xy - wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh - y = torch.cat((xy, wh, y[..., 4:]), -1) - z.append(y.view(bs, -1, self.no)) - - return x if self.training else (torch.cat(z, 1), x) - - def _make_grid(self, nx=20, ny=20, i=0): - d = self.anchors[i].device - if check_version(torch.__version__, '1.10.0'): # torch>=1.10.0 meshgrid workaround for torch>=0.7 compatibility - yv, xv = torch.meshgrid([torch.arange(ny, device=d), torch.arange(nx, device=d)], indexing='ij') - else: - yv, xv = torch.meshgrid([torch.arange(ny, device=d), torch.arange(nx, device=d)]) - grid = torch.stack((xv, yv), 2).expand((1, self.na, ny, nx, 2)).float() - anchor_grid = (self.anchors[i].clone() * self.stride[i]) \ - .view((1, self.na, 1, 1, 2)).expand((1, self.na, ny, nx, 2)).float() - return grid, anchor_grid - - -class YOLOModel(nn.Module): - def __init__( - self, - config='yolov5s.yaml', - in_chans=2, - num_classes=None, - anchors=None, - ): - super().__init__() - if isinstance(config, dict): - self.yaml = config # model dict - else: # is *.yaml - import yaml # for torch hub - dir_path = os.path.dirname(os.path.realpath(__file__)) - config = dir_path + "/" + config - self.yaml_file = Path(config).name - with open(config, encoding='ascii', errors='ignore') as f: - self.yaml = yaml.safe_load(f) # model dict - - # Define model - in_chans = self.yaml['ch'] = self.yaml.get('ch', in_chans) # input channels - if num_classes and num_classes != self.yaml['nc']: - 
self.yaml['nc'] = num_classes # override yaml value - if anchors: - self.yaml['anchors'] = round(anchors) # override yaml value - self.model, self.save = parse_model(deepcopy(self.yaml), ch=[in_chans]) # model, savelist - self.names = [str(i) for i in range(self.yaml['nc'])] # default names - self.inplace = self.yaml.get('inplace', True) - - # Build strides, anchors - m = self.model[-1] # Detect() - if isinstance(m, Detect): - s = 256 # 2x min stride - m.inplace = self.inplace - m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, in_chans, s, s))]) # forward - m.anchors /= m.stride.view(-1, 1, 1) - check_anchor_order(m) - self.stride = m.stride - self._initialize_biases() # only run once - - # Init weights, biases - initialize_weights(self) - self.info() - - def forward(self, x, augment=False, profile=False, visualize=False): - if augment: - return self._forward_augment(x) # augmented inference, None - return self._forward_once(x, profile, visualize) # single-scale inference, train - - def _forward_augment(self, x): - img_size = x.shape[-2:] # height, width - s = [1, 0.83, 0.67] # scales - f = [None, 3, None] # flips (2-ud, 3-lr) - y = [] # outputs - for si, fi in zip(s, f): - xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max())) - yi = self._forward_once(xi)[0] # forward - yi = self._descale_pred(yi, fi, si, img_size) - #yi = self.dequant(yi) - y.append(yi) - y = self._clip_augmented(y) # clip augmented tails - - return torch.cat(y, 1), None # augmented inference, train - - def _forward_once(self, x, profile=False, visualize=False): - y, dt = [], [] # outputs - for m in self.model: - if m.f != -1: # if not from previous layer - if isinstance(m.f, int): - x = y[m.f] - else: - # from earlier layers - x = [x if j == -1 else y[j] for j in m.f] - - if profile: - self._profile_one_layer(m, x, dt) - - x = m(x) # run - y.append(x if m.i in self.save else None) # save output - if visualize: - feature_visualization(x, m.type, m.i, save_dir=visualize) - - return x - - def _descale_pred(self, p, flips, scale, img_size): - # de-scale predictions following augmented inference (inverse operation) - if self.inplace: - p[..., :4] /= scale # de-scale - if flips == 2: - p[..., 1] = img_size[0] - p[..., 1] # de-flip ud - elif flips == 3: - p[..., 0] = img_size[1] - p[..., 0] # de-flip lr - else: - x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale # de-scale - if flips == 2: - y = img_size[0] - y # de-flip ud - elif flips == 3: - x = img_size[1] - x # de-flip lr - p = torch.cat((x, y, wh, p[..., 4:]), -1) - return p - - def _clip_augmented(self, y): - # Clip YOLOv5 augmented inference tails - nl = self.model[-1].nl # number of detection layers (P3-P5) - g = sum(4 ** x for x in range(nl)) # grid points - e = 1 # exclude layer count - i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e)) # indices - y[0] = y[0][:, :-i] # large - i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e)) # indices - y[-1] = y[-1][:, i:] # small - return y - - def _profile_one_layer(self, m, x, dt): - c = isinstance(m, Detect) # is final layer, copy input as inplace fix - o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs - t = time_sync() - for _ in range(10): - m(x.copy() if c else x) - dt.append((time_sync() - t) * 100) - - def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency - # https://arxiv.org/abs/1708.02002 section 3.3 - # cf = 
torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1. - m = self.model[-1] # Detect() module - for mi, s in zip(m.m, m.stride): # from - b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85) - b.data[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image) - b.data[:, 5:] += math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # cls - mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) - - def _print_biases(self): - m = self.model[-1] # Detect() module - for mi in m.m: # from - b = mi.bias.detach().view(m.na, -1).T # conv.bias(255) to (3,85) - - def fuse(self): # fuse model Conv2d() + BatchNorm2d() layers - for m in self.model.modules(): - if isinstance(m, (Conv, DWConv)) and hasattr(m, 'bn'): - m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv - delattr(m, 'bn') # remove batchnorm - m.forward = m.forward_fuse # update forward - self.info() - return self - - def info(self, verbose=False, img_size=640): # print model information - model_info(self, verbose, img_size) - - def _apply(self, fn): - # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers - self = super()._apply(fn) - m = self.model[-1] # Detect() - if isinstance(m, Detect): - m.stride = fn(m.stride) - m.grid = list(map(fn, m.grid)) - if isinstance(m.anchor_grid, list): - m.anchor_grid = list(map(fn, m.anchor_grid)) - return self - - -def parse_model(d, ch): # model_dict, input_channels(3) - anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'] - na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors - no = na * (nc + 5) # number of outputs = anchors * (classes + 5) - - layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out - for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args - m = eval(m) if isinstance(m, str) else m # eval strings - for j, a in enumerate(args): - try: - args[j] = eval(a) if isinstance(a, str) else a # eval strings - except NameError: - pass - - n = n_ = max(round(n * gd), 1) if n > 1 else n # depth gain - if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv, - BottleneckCSP, C3, C3TR, C3SPP, C3Ghost]: - c1, c2 = ch[f], args[0] - if c2 != no: # if not output - c2 = make_divisible(c2 * gw, 8) - - args = [c1, c2, *args[1:]] - if m in [BottleneckCSP, C3, C3TR, C3Ghost]: - args.insert(2, n) # number of repeats - n = 1 - elif m is nn.BatchNorm2d: - args = [ch[f]] - elif m is Concat: - c2 = sum(ch[x] for x in f) - elif m is Detect: - args.append([ch[x] for x in f]) - if isinstance(args[1], int): # number of anchors - args[1] = [list(range(args[1] * 2))] * len(f) - elif m is Contract: - c2 = ch[f] * args[0] ** 2 - elif m is Expand: - c2 = ch[f] // args[0] ** 2 - else: - c2 = ch[f] - - m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module - t = str(m)[8:-2].replace('__main__.', '') # module type - np = sum(x.numel() for x in m_.parameters()) # number params - m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params - save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist - layers.append(m_) - if i == 0: - ch = [] - ch.append(c2) - return nn.Sequential(*layers), sorted(save) - - -def autopad(k, p=None): # kernel, padding - # Pad to 'same' - if p is None: - p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad - 
return p - - -class Conv(nn.Module): - # Standard convolution - def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups - super().__init__() - self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False) - self.bn = nn.BatchNorm2d(c2) - self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity()) - - def forward(self, x): - return self.act(self.bn(self.conv(x))) - - def forward_fuse(self, x): - return self.act(self.conv(x)) - - -class DWConv(Conv): - # Depth-wise convolution class - def __init__(self, c1, c2, k=1, s=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups - super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), act=act) - - -class Bottleneck(nn.Module): - # Standard bottleneck - def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = Conv(c_, c2, 3, 1, g=g) - self.add = shortcut and c1 == c2 - - def forward(self, x): - return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) - - -class BottleneckCSP(nn.Module): - # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) - self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) - self.cv4 = Conv(2 * c_, c2, 1, 1) - self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) - self.act = nn.SiLU() - self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) - - def forward(self, x): - y1 = self.cv3(self.m(self.cv1(x))) - y2 = self.cv2(x) - return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) - - -class C3(nn.Module): - # CSP Bottleneck with 3 convolutions - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = Conv(c1, c_, 1, 1) - self.cv3 = Conv(2 * c_, c2, 1) # act=FReLU(c2) - self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) - # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)]) - - def forward(self, x): - return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1)) - - -class C3TR(C3): - # C3 module with TransformerBlock() - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - super().__init__(c1, c2, n, shortcut, g, e) - c_ = int(c2 * e) - self.m = TransformerBlock(c_, c_, 4, n) - - -class C3SPP(C3): - # C3 module with SPP() - def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5): - super().__init__(c1, c2, n, shortcut, g, e) - c_ = int(c2 * e) - self.m = SPP(c_, c_, k) - - -class C3Ghost(C3): - # C3 module with GhostBottleneck() - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - super().__init__(c1, c2, n, shortcut, g, e) - c_ = int(c2 * e) # hidden channels - self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) - - -class SPP(nn.Module): - # Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729 - def __init__(self, c1, c2, k=(5, 9, 13)): - super().__init__() - c_ = c1 // 2 # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = Conv(c_ * (len(k) + 1), c2, 
1, 1) - self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) - - def forward(self, x): - x = self.cv1(x) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning - return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) - - -class SPPF(nn.Module): - # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher - def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13)) - super().__init__() - c_ = c1 // 2 # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = Conv(c_ * 4, c2, 1, 1) - self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) - - def forward(self, x): - x = self.cv1(x) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning - y1 = self.m(x) - y2 = self.m(y1) - return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1)) - - -class Focus(nn.Module): - # Focus wh information into c-space - def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups - super().__init__() - self.conv = Conv(c1 * 4, c2, k, s, p, g, act) - # self.contract = Contract(gain=2) - - def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) - return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1)) - # return self.conv(self.contract(x)) - - -class GhostConv(nn.Module): - # Ghost Convolution https://github.com/huawei-noah/ghostnet - def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups - super().__init__() - c_ = c2 // 2 # hidden channels - self.cv1 = Conv(c1, c_, k, s, None, g, act) - self.cv2 = Conv(c_, c_, 5, 1, None, c_, act) - - def forward(self, x): - y = self.cv1(x) - return torch.cat([y, self.cv2(y)], 1) - - -class GhostBottleneck(nn.Module): - # Ghost Bottleneck https://github.com/huawei-noah/ghostnet - def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride - super().__init__() - c_ = c2 // 2 - self.conv = nn.Sequential(GhostConv(c1, c_, 1, 1), # pw - DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw - GhostConv(c_, c2, 1, 1, act=False)) # pw-linear - self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), - Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity() - - def forward(self, x): - return self.conv(x) + self.shortcut(x) - - -class Contract(nn.Module): - # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40) - def __init__(self, gain=2): - super().__init__() - self.gain = gain - - def forward(self, x): - b, c, h, w = x.size() # assert (h / s == 0) and (W / s == 0), 'Indivisible gain' - s = self.gain - x = x.view(b, c, h // s, s, w // s, s) # x(1,64,40,2,40,2) - x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # x(1,2,2,64,40,40) - return x.view(b, c * s * s, h // s, w // s) # x(1,256,40,40) - - -class Expand(nn.Module): - # Expand channels into width-height, i.e. 
x(1,64,80,80) to x(1,16,160,160) - def __init__(self, gain=2): - super().__init__() - self.gain = gain - - def forward(self, x): - b, c, h, w = x.size() # assert C / s ** 2 == 0, 'Indivisible gain' - s = self.gain - x = x.view(b, s, s, c // s ** 2, h, w) # x(1,2,2,16,80,80) - x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # x(1,16,80,2,80,2) - return x.view(b, c // s ** 2, h * s, w * s) # x(1,16,160,160) - - -class Concat(nn.Module): - # Concatenate a list of tensors along dimension - def __init__(self, dimension=1): - super().__init__() - self.d = dimension - - def forward(self, x): - return torch.cat(x, self.d) - - -class CrossConv(nn.Module): - # Cross Convolution Downsample - def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): - # ch_in, ch_out, kernel, stride, groups, expansion, shortcut - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, (1, k), (1, s)) - self.cv2 = Conv(c_, c2, (k, 1), (s, 1), g=g) - self.add = shortcut and c1 == c2 - - def forward(self, x): - return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) - - -class MixConv2d(nn.Module): - # Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595 - def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): # ch_in, ch_out, kernel, stride, ch_strategy - super().__init__() - n = len(k) # number of convolutions - if equal_ch: # equal c_ per group - i = torch.linspace(0, n - 1E-6, c2).floor() # c2 indices - c_ = [(i == g).sum() for g in range(n)] # intermediate channels - else: # equal weight.numel() per group - b = [c2] + [0] * n - a = np.eye(n + 1, n, k=-1) - a -= np.roll(a, 1, axis=1) - a *= np.array(k) ** 2 - a[0] = 1 - c_ = np.linalg.lstsq(a, b, rcond=None)[0].round() # solve for equal weight indices, ax = b - - self.m = nn.ModuleList( - [nn.Conv2d(c1, int(c_), k, s, k // 2, groups=math.gcd(c1, int(c_)), bias=False) for k, c_ in zip(k, c_)]) - self.bn = nn.BatchNorm2d(c2) - self.act = nn.SiLU() - - def forward(self, x): - return self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) - - -def create_yolov5( - network: str = 'yolov5s', - num_classes: int = 53, -) -> torch.nn.Module: - """ - Function used to build a YOLOv5 network - - Args: - TODO - - Returns: - torch.nn.Module - - """ - if not ( - network == "yolov5f" or \ - network == "yolov5p" or \ - network == "yolov5n" or \ - network == "yolov5s" or \ - network == "yolov5m" or \ - network == "yolov5l" or \ - network == "yolov5x" - ): - raise NotImplemented("YOLO network specified not implemented.") - - # Build full YOLOv5 network - network = YOLOModel( - config="{}.yaml".format(network), - num_classes=num_classes, - in_chans=2, - ) - - return network \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/yolov5/utils.py b/torchsig/models/spectrogram_models/yolov5/utils.py deleted file mode 100644 index 1ef5ec0..0000000 --- a/torchsig/models/spectrogram_models/yolov5/utils.py +++ /dev/null @@ -1,343 +0,0 @@ -import time -import math -import numpy as np -import pkg_resources as pkg -from typing import List, Optional, Iterable, List, Any -import torch -from torch import nn -from torch import Tensor -import torch.distributed as dist -from torch.optim.lr_scheduler import LambdaLR -import torchvision - - -def prep_targets(targets: List, device: torch.device = 'cuda') -> torch.Tensor: - device = targets[0]['labels'].device - t_targets = [] - for (i, t) in enumerate(targets): - idx = torch.as_tensor([i], device=device).repeat(len(t['labels'])).reshape(-1,1) - t_targets.append( - torch.cat( - (idx, 
t['labels'].reshape(-1,1), t['boxes']), - dim=-1 - ) - ) - return torch.cat(t_targets) - - -def make_divisible(x, divisor): - # Returns nearest x divisible by divisor - if isinstance(divisor, torch.Tensor): - divisor = int(divisor.max()) # to int - return math.ceil(x / divisor) * divisor - - -def check_anchor_order(m): - # Check anchor order against stride order for YOLOv5 Detect() module m, and correct if necessary - a = m.anchors.prod(-1).view(-1) # anchor area - da = a[-1] - a[0] # delta a - ds = m.stride[-1] - m.stride[0] # delta s - if da.sign() != ds.sign(): # same order - m.anchors[:] = m.anchors.flip(0) - - -def initialize_weights(model): - for m in model.modules(): - t = type(m) - if t is nn.Conv2d: - pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif t is nn.BatchNorm2d: - m.eps = 1e-3 - m.momentum = 0.03 - elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: - m.inplace = True - - -def model_info(model, verbose=False, img_size=640): - # Model information. img_size may be int or list, i.e. img_size=640 or img_size=[640, 320] - n_p = sum(x.numel() for x in model.parameters()) # number parameters - n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients - if verbose: - print(f"{'layer':>5} {'name':>40} {'gradient':>9} {'parameters':>12} {'shape':>20} {'mu':>10} {'sigma':>10}") - for i, (name, p) in enumerate(model.named_parameters()): - name = name.replace('module_list.', '') - print('%5g %40s %9s %12g %20s %10.3g %10.3g' % - (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) - - try: # FLOPs - from thop import profile - stride = max(int(model.stride.max()), 32) if hasattr(model, 'stride') else 32 - img = torch.zeros((1, model.yaml.get('ch', 3), stride, stride), device=next(model.parameters()).device) # input - flops = profile(deepcopy(model), inputs=(img,), verbose=False)[0] / 1E9 * 2 # stride GFLOPs - img_size = img_size if isinstance(img_size, list) else [img_size, img_size] # expand if int/float - fs = ', %.1f GFLOPs' % (flops * img_size[0] / stride * img_size[1] / stride) # 640x640 GFLOPs - except (ImportError, Exception): - fs = '' - - -def box_iou(box1, box2): - # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py - """ - Return intersection-over-union (Jaccard index) of boxes. - Both sets of boxes are expected to be in (x1, y1, x2, y2) format. - Arguments: - box1 (Tensor[N, 4]) - box2 (Tensor[M, 4]) - Returns: - iou (Tensor[N, M]): the NxM matrix containing the pairwise - IoU values for every element in boxes1 and boxes2 - """ - def box_area(box): - # box = 4xn - return (box[2] - box[0]) * (box[3] - box[1]) - - area1 = box_area(box1.T) - area2 = box_area(box2.T) - - # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) - inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2) - return inter / (area1[:, None] + area2 - inter) # iou = inter / (area1 + area2 - inter) - - -def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7): - # Returns the IoU of box1 to box2. 
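[Editor's sketch, not part of this patch; boxes are made up.] A quick worked example of the pairwise IoU that box_iou above computes; torchvision.ops.box_iou yields the same N x M matrix:

import torch
from torchvision.ops import box_iou

a = torch.tensor([[0.0, 0.0, 10.0, 10.0]])            # one 10x10 box
b = torch.tensor([[5.0, 5.0, 15.0, 15.0],             # overlaps a in a 5x5 patch
                  [20.0, 20.0, 30.0, 30.0]])          # disjoint from a
print(box_iou(a, b))  # tensor([[0.1429, 0.0000]])  ->  25 / (100 + 100 - 25)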
box1 is 4, box2 is nx4 - box2 = box2.T - - # Get the coordinates of bounding boxes - if x1y1x2y2: # x1, y1, x2, y2 = box1 - b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] - b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] - else: # transform from xywh to xyxy - b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2 - b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2 - b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2 - b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2 - - # Intersection area - inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \ - (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0) - - # Union Area - w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps - w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps - union = w1 * h1 + w2 * h2 - inter + eps - - iou = inter / union - if CIoU or DIoU or GIoU: - cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1) # convex (smallest enclosing box) width - ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height - if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1 - c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared - rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + - (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center distance squared - if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 - v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) - with torch.no_grad(): - alpha = v / (v - iou + (1 + eps)) - return iou - (rho2 / c2 + v * alpha) # CIoU - return iou - rho2 / c2 # DIoU - c_area = cw * ch + eps # convex area - return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf - return iou # IoU - - -def is_parallel(model): - # Returns True if model is of type DP or DDP - return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) - - -def check_version(current='0.0.0', minimum='0.0.0', name='version ', pinned=False, hard=False, verbose=False): - # Check version vs. 
required version - current, minimum = (pkg.parse_version(x) for x in (current, minimum)) - result = (current == minimum) if pinned else (current >= minimum) # bool - s = f'{name}{minimum} required by YOLOv5, but {name}{current} is currently installed' # string - if hard: - assert result, s # assert min requirements met - return result - - -def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0): - # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right - y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) - y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw # top left x - y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + padh # top left y - y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + padw # bottom right x - y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + padh # bottom right y - return y - - -def xywh2xyxy(x): - # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right - y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) - y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x - y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y - y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x - y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y - return y - - -def non_max_suppression( - prediction, - conf_thres=0.25, - iou_thres=0.45, - classes=None, - agnostic=False, - multi_label=False, - labels=(), - max_det=300, -): - """Runs Non-Maximum Suppression (NMS) on inference results - - Returns: - list of detections, on (n,6) tensor per image [xyxy, conf, cls] - """ - nc = prediction.shape[2] - 5 # number of classes - xc = prediction[..., 4] > conf_thres # candidates - - # Checks - assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0' - assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0' - - # Settings - min_wh, max_wh = 2, 7680 # (pixels) minimum and maximum box width and height - max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() - time_limit = 10.0 # seconds to quit after - redundant = True # require redundant detections - multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img) - merge = False # use merge-NMS - - t = time.time() - output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0] - for xi, x in enumerate(prediction): # image index, image inference - # Apply constraints - # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height - x = x[xc[xi]] # confidence - - # Cat apriori labels if autolabelling - if labels and len(labels[xi]): - l = labels[xi] - v = torch.zeros((len(l), nc + 5), device=x.device) - v[:, :4] = l[:, 1:5] # box - v[:, 4] = 1.0 # conf - v[range(len(l)), l[:, 0].long() + 5] = 1.0 # cls - x = torch.cat((x, v), 0) - - # If none remain process next image - if not x.shape[0]: - continue - - # Compute conf - x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf - - # Box (center x, center y, width, height) to (x1, y1, x2, y2) - box = xywh2xyxy(x[:, :4]) - - # Detections matrix nx6 (xyxy, conf, cls) - if multi_label: - i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T - x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1) - else: # best class only - conf, j = x[:, 5:].max(1, keepdim=True) - x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres] - - # Filter by class - if classes is not None: - x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] - - # Apply finite constraint - # if not torch.isfinite(x).all(): - # x = 
x[torch.isfinite(x).all(1)] - - # Check shape - n = x.shape[0] # number of boxes - if not n: # no boxes - continue - elif n > max_nms: # excess boxes - x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence - - # Batched NMS - c = x[:, 5:6] * (0 if agnostic else max_wh) # classes - boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores - i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS - if i.shape[0] > max_det: # limit detections - i = i[:max_det] - if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean) - # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) - iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix - weights = iou * scores[None] # box weights - x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes - if redundant: - i = i[iou.sum(1) > 1] # require redundancy - - output[xi] = x[i] - if (time.time() - t) > time_limit: - print(f'WARNING: NMS time limit {time_limit}s exceeded') - break # time limit exceeded - - return output - - -def format_preds(preds, num_classes=1, threshold=0.5): - map_preds = [] - - # Loop over examples in batch - for pred in preds: - boxes = [] - scores = [] - labels = [] - - # Interpret YOLO outputs - pred = torch.unsqueeze(pred, 0) - pred = non_max_suppression(pred, iou_thres=threshold) - pred = torch.cat(pred).cpu().numpy() - - for obj_idx, obj in enumerate(pred): - center_time = (obj[0] + obj[2]) / 2 - center_freq = (obj[1] + obj[3]) / 2 - duration = obj[2] - obj[0] - bandwidth = obj[3] - obj[1] - - boxes.append([max(0,obj[0]), max(0,obj[1]), min(512,obj[2]), min(512,obj[3])]) - scores.extend([obj[4]]) - labels.extend([int(1)]) if num_classes == 1 else labels.extend([int(obj[5])]) - - curr_pred = dict( - boxes=torch.tensor(boxes).to("cuda"), - scores=torch.tensor(scores).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - map_preds.append(curr_pred) - - return map_preds - - -def format_targets(labels, num_classes=1): - map_targets = [] - - for i, label in enumerate(labels): - boxes = [] - scores = [] - labels = [] - - for label_obj_idx in range(len(label['labels'])): - center_time = label["boxes"][label_obj_idx][0] - center_freq = label["boxes"][label_obj_idx][1] - duration = label["boxes"][label_obj_idx][2] - bandwidth = label["boxes"][label_obj_idx][3] - class_idx = label["labels"][label_obj_idx] - - x1 = (center_time - duration / 2) * 512 - y1 = (center_freq - bandwidth / 2) * 512 - x2 = (center_time + duration / 2) * 512 - y2 = (center_freq + bandwidth / 2) * 512 - - boxes.append([x1, y1, x2, y2]) - labels.extend([int(1)]) if num_classes == 1 else labels.extend([int(class_idx)]) - - curr_target = dict( - boxes=torch.tensor(boxes).to("cuda"), - labels=torch.IntTensor(labels).to("cuda"), - ) - map_targets.append(curr_target) - - return map_targets \ No newline at end of file diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5.py b/torchsig/models/spectrogram_models/yolov5/yolov5.py deleted file mode 100644 index bd8fbdd..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5.py +++ /dev/null @@ -1,247 +0,0 @@ -import timm -import gdown -import torch -import os.path -import numpy as np -from torch import nn - -from .modules import * -from .utils import * -from .mean_ap import * - - -__all__ = [ - "yolov5p", "yolov5n", "yolov5s", - "yolov5p_mod_family", "yolov5n_mod_family", "yolov5s_mod_family", -] - -model_urls = { - "yolov5p": "1d1ihKbtGQciRwqmBDrHiESZ22zx9W01S", - "yolov5n": "184h1f8-DV3FDYd01X7TdKmxWZ2s73FiH", - 
"yolov5s": "1t7hHB4uXJ0BaSEmq_li2oj1tEDXgZh0z", - "yolov5p_mod_family": "1z8VLEpVqQEFPW3u4T3Yd6c5J0e__UDqf", - "yolov5n_mod_family": "1B2ke51DGbpZXOMhuWTQLXZaDM59VC5Mm", - "yolov5s_mod_family": "1HzcKfM4URtAqhCIQr_obXWWbYIFsEE4s", -} - - -def yolov5p( - pretrained: bool = False, - path: str = "yolov5p.pt", - num_classes: int = 1, -): - """Constructs a YOLOv5 architecture with Pico scaling. - YOLOv5 from `"YOLOv5 GitHub" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - - """ - # Create YOLOv5p - mdl = create_yolov5( - network='yolov5p', - num_classes=1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['yolov5p'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.model[-1].no = int(mdl.model[-1].no / (1 + 5) * (num_classes + 5)) - for det_conv_idx in range(len(mdl.model[-1].m)): - mdl.model[-1].m[det_conv_idx] = torch.nn.Conv2d( - in_channels=mdl.model[-1].m[det_conv_idx].in_channels, - out_channels=int(mdl.model[-1].m[det_conv_idx].out_channels / (1+5) * (num_classes + 5)), - kernel_size=mdl.model[-1].m[det_conv_idx].kernel_size, - stride=mdl.model[-1].m[det_conv_idx].stride, - ) - return mdl - - -def yolov5n( - pretrained: bool = False, - path: str = "yolov5n.pt", - num_classes: int = 1, -): - """Constructs a YOLOv5 architecture with Nano scaling. - YOLOv5 from `"YOLOv5 GitHub" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - - """ - # Create YOLOv5p - mdl = create_yolov5( - network='yolov5n', - num_classes=1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['yolov5n'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.model[-1].no = int(mdl.model[-1].no / (1 + 5) * (num_classes + 5)) - for det_conv_idx in range(len(mdl.model[-1].m)): - mdl.model[-1].m[det_conv_idx] = torch.nn.Conv2d( - in_channels=mdl.model[-1].m[det_conv_idx].in_channels, - out_channels=int(mdl.model[-1].m[det_conv_idx].out_channels / (1+5) * (num_classes + 5)), - kernel_size=mdl.model[-1].m[det_conv_idx].kernel_size, - stride=mdl.model[-1].m[det_conv_idx].stride, - ) - return mdl - - -def yolov5s( - pretrained: bool = False, - path: str = "yolov5s.pt", - num_classes: int = 1, -): - """Constructs a YOLOv5 architecture with Small scaling. - YOLOv5 from `"YOLOv5 GitHub" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - - """ - # Create YOLOv5p - mdl = create_yolov5( - network='yolov5s', - num_classes=1, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['yolov5s'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 1: - mdl.model[-1].no = int(mdl.model[-1].no / (1 + 5) * (num_classes + 5)) - for det_conv_idx in range(len(mdl.model[-1].m)): - mdl.model[-1].m[det_conv_idx] = torch.nn.Conv2d( - in_channels=mdl.model[-1].m[det_conv_idx].in_channels, - out_channels=int(mdl.model[-1].m[det_conv_idx].out_channels / (1+5) * (num_classes + 5)), - kernel_size=mdl.model[-1].m[det_conv_idx].kernel_size, - stride=mdl.model[-1].m[det_conv_idx].stride, - ) - return mdl - - -def yolov5p_mod_family( - pretrained: bool = False, - path: str = "yolov5p.pt", - num_classes: int = 6, -): - """Constructs a YOLOv5 architecture with Pico scaling. - YOLOv5 from `"YOLOv5 GitHub" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - - """ - # Create YOLOv5p - mdl = create_yolov5( - network='yolov5p', - num_classes=6, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['yolov5p_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.model[-1].no = int(mdl.model[-1].no / (6 + 5) * (num_classes + 5)) - for det_conv_idx in range(len(mdl.model[-1].m)): - mdl.model[-1].m[det_conv_idx] = torch.nn.Conv2d( - in_channels=mdl.model[-1].m[det_conv_idx].in_channels, - out_channels=int(mdl.model[-1].m[det_conv_idx].out_channels / (6+5) * (num_classes + 5)), - kernel_size=mdl.model[-1].m[det_conv_idx].kernel_size, - stride=mdl.model[-1].m[det_conv_idx].stride, - ) - return mdl - - -def yolov5n_mod_family( - pretrained: bool = False, - path: str = "yolov5n.pt", - num_classes: int = 6, -): - """Constructs a YOLOv5 architecture with Nano scaling. - YOLOv5 from `"YOLOv5 GitHub" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - - """ - # Create YOLOv5p - mdl = create_yolov5( - network='yolov5n', - num_classes=6, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['yolov5n_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.model[-1].no = int(mdl.model[-1].no / (6 + 5) * (num_classes + 5)) - for det_conv_idx in range(len(mdl.model[-1].m)): - mdl.model[-1].m[det_conv_idx] = torch.nn.Conv2d( - in_channels=mdl.model[-1].m[det_conv_idx].in_channels, - out_channels=int(mdl.model[-1].m[det_conv_idx].out_channels / (6+5) * (num_classes + 5)), - kernel_size=mdl.model[-1].m[det_conv_idx].kernel_size, - stride=mdl.model[-1].m[det_conv_idx].stride, - ) - return mdl - - -def yolov5s_mod_family( - pretrained: bool = False, - path: str = "yolov5s.pt", - num_classes: int = 6, -): - """Constructs a YOLOv5 architecture with Small scaling. - YOLOv5 from `"YOLOv5 GitHub" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on WBSig53 - path (str): Path to existing model or where to download checkpoint to - num_classes (int): Number of output classes; if loading checkpoint and number does not equal 1, final layer will not be loaded from checkpoint - - """ - # Create YOLOv5p - mdl = create_yolov5( - network='yolov5s', - num_classes=6, - ) - if pretrained: - model_exists = os.path.exists(path) - if not model_exists: - file_id = model_urls['yolov5s_mod_family'] - dl = gdown.download(id=file_id, output=path) - mdl.load_state_dict(torch.load(path), strict=False) - if num_classes != 6: - mdl.model[-1].no = int(mdl.model[-1].no / (6 + 5) * (num_classes + 5)) - for det_conv_idx in range(len(mdl.model[-1].m)): - mdl.model[-1].m[det_conv_idx] = torch.nn.Conv2d( - in_channels=mdl.model[-1].m[det_conv_idx].in_channels, - out_channels=int(mdl.model[-1].m[det_conv_idx].out_channels / (6+5) * (num_classes + 5)), - kernel_size=mdl.model[-1].m[det_conv_idx].kernel_size, - stride=mdl.model[-1].m[det_conv_idx].stride, - ) - return mdl diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5f.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5f.yaml deleted file mode 100644 index 9f76f04..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5f.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -#depth_multiple: 0.23 # model depth multiple -#width_multiple: 0.1875 # layer channel multiple -depth_multiple: 0.1 -width_multiple: 0.05 -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 
13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], # 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5l.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5l.yaml deleted file mode 100644 index ce8a5de..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5l.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -depth_multiple: 1.0 # model depth multiple -width_multiple: 1.0 # layer channel multiple -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], # 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5m.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5m.yaml deleted file mode 100644 index ad13ab3..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5m.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -depth_multiple: 0.67 # model depth multiple -width_multiple: 0.75 # layer channel multiple -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], 
# 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5n.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5n.yaml deleted file mode 100644 index 8a28a40..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5n.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -depth_multiple: 0.33 # model depth multiple -width_multiple: 0.25 # layer channel multiple -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], # 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5p.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5p.yaml deleted file mode 100644 index a399a01..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5p.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -#depth_multiple: 0.23 # model depth multiple -#width_multiple: 0.1875 # layer channel multiple -depth_multiple: 0.2 -width_multiple: 0.1 -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], # 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git 
a/torchsig/models/spectrogram_models/yolov5/yolov5s.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5s.yaml deleted file mode 100644 index f35beab..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5s.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -depth_multiple: 0.33 # model depth multiple -width_multiple: 0.50 # layer channel multiple -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], # 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git a/torchsig/models/spectrogram_models/yolov5/yolov5x.yaml b/torchsig/models/spectrogram_models/yolov5/yolov5x.yaml deleted file mode 100644 index f617a02..0000000 --- a/torchsig/models/spectrogram_models/yolov5/yolov5x.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# YOLOv5 🚀 by Ultralytics, GPL-3.0 license - -# Parameters -nc: 80 # number of classes -depth_multiple: 1.33 # model depth multiple -width_multiple: 1.25 # layer channel multiple -anchors: - - [10,13, 16,30, 33,23] # P3/8 - - [30,61, 62,45, 59,119] # P4/16 - - [116,90, 156,198, 373,326] # P5/32 - -# YOLOv5 v6.0 backbone -backbone: - # [from, number, module, args] - [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - [-1, 3, C3, [128]], - [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 - [-1, 6, C3, [256]], - [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 - [-1, 9, C3, [512]], - [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 - [-1, 3, C3, [1024]], - [-1, 1, SPPF, [1024, 5]], # 9 - ] - -# YOLOv5 v6.0 head -head: - [[-1, 1, Conv, [512, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 6], 1, Concat, [1]], # cat backbone P4 - [-1, 3, C3, [512, False]], # 13 - - [-1, 1, Conv, [256, 1, 1]], - [-1, 1, nn.Upsample, [None, 2, 'nearest']], - [[-1, 4], 1, Concat, [1]], # cat backbone P3 - [-1, 3, C3, [256, False]], # 17 (P3/8-small) - - [-1, 1, Conv, [256, 3, 2]], - [[-1, 14], 1, Concat, [1]], # cat head P4 - [-1, 3, C3, [512, False]], # 20 (P4/16-medium) - - [-1, 1, Conv, [512, 3, 2]], - [[-1, 10], 1, Concat, [1]], # cat head P5 - [-1, 3, C3, [1024, False]], # 23 (P5/32-large) - - [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) - ] diff --git a/torchsig/transforms/spectrogram_transforms/spec.py b/torchsig/transforms/spectrogram_transforms/spec.py index 54adf14..77879ee 100644 --- a/torchsig/transforms/spectrogram_transforms/spec.py +++ b/torchsig/transforms/spectrogram_transforms/spec.py @@ -1,179 +1,21 @@ import 
numpy as np from copy import deepcopy from typing import Optional, Any, Union, List - from torchsig.utils.dataset import SignalDataset from torchsig.utils.types import SignalData, SignalDescription from torchsig.transforms.transforms import SignalTransform from torchsig.transforms.spectrogram_transforms import functional -from torchsig.transforms.functional import NumericParameter, FloatParameter, IntParameter -from torchsig.transforms.functional import to_distribution, uniform_continuous_distribution, uniform_discrete_distribution - - -class SpectrogramResize(SignalTransform): - """SpectrogramResize inputs data that has already been transformed into a - spectrogram, and then it crops and/or pads both the time and frequency - dimensions to reach a specified target width (time) and height (frequency). - - Args: - width (:obj:`int`): - Target output width (time) of the spectrogram - height (:obj:`int`): - Target output height (frequency) of the spectrogram - - Example: - >>> import torchsig.transforms as ST - >>> # Resize input spectrogram to (512,512) - >>> transform = ST.SpectrogramResize(width=512, height=512) - - """ - def __init__( - self, - width: int = 512, - height: int = 512, - ): - super(SpectrogramResize, self).__init__() - self.width = width - self.height = height +from torchsig.transforms.functional import ( + NumericParameter, + FloatParameter, + IntParameter, +) +from torchsig.transforms.functional import ( + to_distribution, + uniform_continuous_distribution, + uniform_discrete_distribution, +) - def __call__(self, data: Any) -> Any: - spec_data = data.iq_data if isinstance(data, SignalData) else data - - # Next, perform the random cropping/padding - channels, curr_height, curr_width = spec_data.shape - pad_height, crop_height = False, False - pad_width, crop_width = False, False - pad_height_samps, pad_width_samps = 0, 0 - if curr_height < self.height: - pad_height = True - pad_height_samps = self.height - curr_height - elif curr_height > self.height: - crop_height = True - if curr_width < self.width: - pad_width = True - pad_width_samps = self.width - curr_width - elif curr_width > self.width: - crop_width = True - - if pad_height or pad_width: - def pad_func(vector, pad_width, iaxis, kwargs): - vector[:pad_width[0]] = np.random.rand(len(vector[:pad_width[0]]))*kwargs['pad_value'] - vector[-pad_width[1]:] = np.random.rand(len(vector[-pad_width[1]:]))*kwargs['pad_value'] - - if channels == 2: - new_data_real = np.pad( - spec_data[0], - ( - (pad_height_samps//2+1,pad_height_samps//2+1), - (pad_width_samps//2+1,pad_width_samps//2+1), - ), - pad_func, - pad_value = np.percentile(np.abs(spec_data[0]),50), - ) - new_data_imag = np.pad( - spec_data[1], - ( - (pad_height_samps//2+1,pad_height_samps//2+1), - (pad_width_samps//2+1,pad_width_samps//2+1), - ), - pad_func, - pad_value = np.percentile(np.abs(spec_data[1]),50), - ) - spec_data = np.concatenate( - [ - np.expand_dims(new_data_real,axis=0), - np.expand_dims(new_data_imag,axis=0) - ], - axis=0, - ) - else: - spec_data = np.pad( - spec_data, - ( - (pad_height_samps//2+1,pad_height_samps//2+1), - (pad_width_samps//2+1,pad_width_samps//2+1), - ), - pad_func, - min_value = np.percentile(np.abs(spec_data[0]),50), - ) - - spec_data = spec_data[:,:self.height,:self.width] - - # Update SignalData object if necessary, otherwise return - if isinstance(data, SignalData): - # Create new SignalData object for transformed data - new_data = SignalData( - data=None, - item_type=np.dtype(np.float64), - data_type=np.dtype(np.complex128), - 
signal_description=[], - ) - new_data.iq_data = spec_data - - # Update SignalDescription - new_signal_description = [] - signal_description = [data.signal_description] if isinstance(data.signal_description, SignalDescription) else data.signal_description - for signal_desc in signal_description: - new_signal_desc = deepcopy(signal_desc) - - # Check bounds for partial signals - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 - - # Update labels based on padding/cropping - if pad_height: - new_signal_desc.lower_frequency = ((new_signal_desc.lower_frequency+0.5)*curr_height + pad_height_samps//2+1) / self.height - 0.5 - new_signal_desc.upper_frequency = ((new_signal_desc.upper_frequency+0.5)*curr_height + pad_height_samps//2+1) / self.height - 0.5 - new_signal_desc.center_frequency = ((new_signal_desc.center_frequency+0.5)*curr_height + pad_height_samps//2+1) / self.height - 0.5 - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - - if crop_height: - if (new_signal_desc.lower_frequency+0.5)*curr_height >= crop_height_start+self.height or \ - (new_signal_desc.upper_frequency+0.5)*curr_height <= crop_height_start: - continue - if (new_signal_desc.lower_frequency+0.5)*curr_height <= crop_height_start: - new_signal_desc.lower_frequency = -0.5 - else: - new_signal_desc.lower_frequency = ((new_signal_desc.lower_frequency+0.5)*curr_height) / self.height - 0.5 - if (new_signal_desc.upper_frequency+0.5)*curr_height >= crop_height_start+self.height: - new_signal_desc.upper_frequency = crop_height_start+self.height - else: - new_signal_desc.upper_frequency = ((new_signal_desc.upper_frequency+0.5)*curr_height) / self.height - 0.5 - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth / 2 - - if pad_width: - new_signal_desc.start = (new_signal_desc.start * curr_width + pad_width_samps//2+1) / self.width - new_signal_desc.stop = (new_signal_desc.stop * curr_width + pad_width_samps//2+1) / self.width - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start - - if crop_width: - if new_signal_desc.start*curr_width <= 0: - new_signal_desc.start = 0.0 - elif new_signal_desc.start*curr_width >= self.width: - continue - else: - new_signal_desc.start = (new_signal_desc.start * curr_width) / self.width - if new_signal_desc.stop*curr_width >= self.width: - new_signal_desc.stop = 1.0 - elif new_signal_desc.stop*curr_width <= 0: - continue - else: - new_signal_desc.stop = (new_signal_desc.stop * curr_width) / self.width - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start - - # Append SignalDescription to list - new_signal_description.append(new_signal_desc) - - new_data.signal_description = new_signal_description - - else: - new_data = spec_data - - return new_data - class SpectrogramDropSamples(SignalTransform): """Randomly drop samples from the input data of specified durations and @@ -186,10 +28,10 @@ class SpectrogramDropSamples(SignalTransform): * `min`: replace drop samples with the minimum of the absolute power * 
`max`: replace drop samples with the maximum of the absolute power * `ones`: replace drop samples with ones - + Transform is based off of the `TSAug Dropout Transform `_. - + Args: drop_rate (:py:class:`~Callable`, :obj:`int`, :obj:`float`, :obj:`list`, :obj:`tuple`): drop_rate sets the rate at which to drop samples @@ -197,26 +39,29 @@ class SpectrogramDropSamples(SignalTransform): * If int or float, drop_rate is fixed at the value provided * If list, drop_rate is any element in the list * If tuple, drop_rate is in range of (tuple[0], tuple[1]) - + size (:py:class:`~Callable`, :obj:`int`, :obj:`float`, :obj:`list`, :obj:`tuple`): size sets the size of each instance of dropped samples * If Callable, produces a sample by calling size() * If int or float, size is fixed at the value provided * If list, size is any element in the list * If tuple, size is in range of (tuple[0], tuple[1]) - + fill (:py:class:`~Callable`, :obj:`list`, :obj:`str`): fill sets the method of how the dropped samples should be filled * If Callable, produces a sample by calling fill() * If list, fill is any element in the list * If str, fill is fixed at the method provided - + """ + def __init__( self, - drop_rate: NumericParameter = uniform_continuous_distribution(0.001,0.005), - size: NumericParameter = uniform_discrete_distribution(np.arange(1,10)), - fill: Union[List, str] = uniform_discrete_distribution(["ffill", "bfill", "mean", "zero", "low", "min", "max", "ones"]), + drop_rate: NumericParameter = uniform_continuous_distribution(0.001, 0.005), + size: NumericParameter = uniform_discrete_distribution(np.arange(1, 10)), + fill: Union[List, str] = uniform_discrete_distribution( + ["ffill", "bfill", "mean", "zero", "low", "min", "max", "ones"] + ), ): super(SpectrogramDropSamples, self).__init__() self.drop_rate = to_distribution(drop_rate, self.random_generator) @@ -226,7 +71,7 @@ def __init__( def __call__(self, data: Any) -> Any: drop_rate = self.drop_rate() fill = self.fill() - + if isinstance(data, SignalData): # Create new SignalData object for transformed data new_data = SignalData( @@ -235,31 +80,37 @@ def __call__(self, data: Any) -> Any: data_type=np.dtype(np.float64), signal_description=data.signal_description, ) - + # Perform data augmentation channels, height, width = data.iq_data.shape spec_size = height * width drop_instances = int(spec_size * drop_rate) drop_sizes = self.size(drop_instances).astype(int) - drop_starts = np.random.uniform(1, spec_size-max(drop_sizes)-1, drop_instances).astype(int) - - new_data.iq_data = functional.drop_spec_samples(data.iq_data, drop_starts, drop_sizes, fill) - + drop_starts = np.random.uniform( + 1, spec_size - max(drop_sizes) - 1, drop_instances + ).astype(int) + + new_data.iq_data = functional.drop_spec_samples( + data.iq_data, drop_starts, drop_sizes, fill + ) + else: drop_instances = int(data.shape[0] * drop_rate) drop_sizes = self.size(drop_instances).astype(int) - drop_starts = np.random.uniform(0, data.shape[0]-max(drop_sizes), drop_instances).astype(int) - + drop_starts = np.random.uniform( + 0, data.shape[0] - max(drop_sizes), drop_instances + ).astype(int) + new_data = functional.drop_spec_samples(data, drop_starts, drop_sizes, fill) return new_data - + class SpectrogramPatchShuffle(SignalTransform): """Randomly shuffle multiple local regions of samples. - + Transform is loosely based on `PatchShuffle Regularization `_. 
- + Args: patch_size (:py:class:`~Callable`, :obj:`int`, :obj:`float`, :obj:`list`, :obj:`tuple`): patch_size sets the size of each patch to shuffle @@ -267,19 +118,20 @@ class SpectrogramPatchShuffle(SignalTransform): * If int or float, patch_size is fixed at the value provided * If list, patch_size is any element in the list * If tuple, patch_size is in range of (tuple[0], tuple[1]) - + shuffle_ratio (:py:class:`~Callable`, :obj:`int`, :obj:`float`, :obj:`list`, :obj:`tuple`): shuffle_ratio sets the ratio of the patches to shuffle * If Callable, produces a sample by calling shuffle_ratio() * If int or float, shuffle_ratio is fixed at the value provided * If list, shuffle_ratio is any element in the list * If tuple, shuffle_ratio is in range of (tuple[0], tuple[1]) - + """ + def __init__( self, - patch_size: NumericParameter = uniform_continuous_distribution(2,16), - shuffle_ratio: FloatParameter = uniform_continuous_distribution(0.01,0.10), + patch_size: NumericParameter = uniform_continuous_distribution(2, 16), + shuffle_ratio: FloatParameter = uniform_continuous_distribution(0.01, 0.10), ): super(SpectrogramPatchShuffle, self).__init__() self.patch_size = to_distribution(patch_size, self.random_generator) @@ -288,7 +140,7 @@ def __init__( def __call__(self, data: Any) -> Any: patch_size = int(self.patch_size()) shuffle_ratio = self.shuffle_ratio() - + if isinstance(data, SignalData): # Create new SignalData object for transformed data new_data = SignalData( @@ -297,18 +149,20 @@ def __call__(self, data: Any) -> Any: data_type=np.dtype(np.complex128), signal_description=data.signal_description, ) - + # Perform data augmentation - new_data.iq_data = functional.spec_patch_shuffle(data.iq_data, patch_size, shuffle_ratio) + new_data.iq_data = functional.spec_patch_shuffle( + data.iq_data, patch_size, shuffle_ratio + ) else: new_data = functional.spec_patch_shuffle(data, patch_size, shuffle_ratio) return new_data - + class SpectrogramTranslation(SignalTransform): """Transform that inputs a spectrogram and applies a random time/freq translation - + Args: time_shift (:py:class:`~Callable`, :obj:`int`, :obj:`float`, :obj:`list`, :obj:`tuple`): time_shift sets the translation along the time-axis @@ -316,19 +170,20 @@ class SpectrogramTranslation(SignalTransform): * If int, time_shift is fixed at the value provided * If list, time_shift is any element in the list * If tuple, time_shift is in range of (tuple[0], tuple[1]) - + freq_shift (:py:class:`~Callable`, :obj:`int`, :obj:`float`, :obj:`list`, :obj:`tuple`): freq_shift sets the translation along the freq-axis * If Callable, produces a sample by calling freq_shift() * If int, freq_shift is fixed at the value provided * If list, freq_shift is any element in the list * If tuple, freq_shift is in range of (tuple[0], tuple[1]) - + """ + def __init__( self, - time_shift: IntParameter = uniform_continuous_distribution(-128,128), - freq_shift: IntParameter = uniform_continuous_distribution(-128,128), + time_shift: IntParameter = uniform_continuous_distribution(-128, 128), + freq_shift: IntParameter = uniform_continuous_distribution(-128, 128), ): super(SpectrogramTranslation, self).__init__() self.time_shift = to_distribution(time_shift, self.random_generator) @@ -337,7 +192,7 @@ def __init__( def __call__(self, data: Any) -> Any: time_shift = int(self.time_shift()) freq_shift = int(self.freq_shift()) - + if isinstance(data, SignalData): # Create new SignalData object for transformed data new_data = SignalData( @@ -346,64 +201,108 @@ def 
__call__(self, data: Any) -> Any: data_type=np.dtype(np.complex128), signal_description=data.signal_description, ) - - new_data.iq_data = functional.spec_translate(data.iq_data, time_shift, freq_shift) - + + new_data.iq_data = functional.spec_translate( + data.iq_data, time_shift, freq_shift + ) + # Update SignalDescription new_signal_description = [] - signal_description = [data.signal_description] if isinstance(data.signal_description, SignalDescription) else data.signal_description + signal_description = ( + [data.signal_description] + if isinstance(data.signal_description, SignalDescription) + else data.signal_description + ) for signal_desc in signal_description: new_signal_desc = deepcopy(signal_desc) - + # Update time fields - new_signal_desc.start = new_signal_desc.start + time_shift / new_data.iq_data.shape[1] - new_signal_desc.stop = new_signal_desc.stop + time_shift / new_data.iq_data.shape[1] + new_signal_desc.start = ( + new_signal_desc.start + time_shift / new_data.iq_data.shape[1] + ) + new_signal_desc.stop = ( + new_signal_desc.stop + time_shift / new_data.iq_data.shape[1] + ) if new_signal_desc.start >= 1.0 or new_signal_desc.stop <= 0.0: continue - new_signal_desc.start = 0.0 if new_signal_desc.start < 0.0 else new_signal_desc.start - new_signal_desc.stop = 1.0 if new_signal_desc.stop > 1.0 else new_signal_desc.stop + new_signal_desc.start = ( + 0.0 if new_signal_desc.start < 0.0 else new_signal_desc.start + ) + new_signal_desc.stop = ( + 1.0 if new_signal_desc.stop > 1.0 else new_signal_desc.stop + ) new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start - + # Trim any out-of-capture freq values - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency - + new_signal_desc.lower_frequency = ( + -0.5 + if new_signal_desc.lower_frequency < -0.5 + else new_signal_desc.lower_frequency + ) + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) + # Update freq fields - new_signal_desc.lower_frequency = new_signal_desc.lower_frequency + freq_shift / new_data.iq_data.shape[2] - new_signal_desc.upper_frequency = new_signal_desc.upper_frequency + freq_shift / new_data.iq_data.shape[2] - if new_signal_desc.lower_frequency >= 0.5 or new_signal_desc.upper_frequency <= -0.5: + new_signal_desc.lower_frequency = ( + new_signal_desc.lower_frequency + + freq_shift / new_data.iq_data.shape[2] + ) + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency + + freq_shift / new_data.iq_data.shape[2] + ) + if ( + new_signal_desc.lower_frequency >= 0.5 + or new_signal_desc.upper_frequency <= -0.5 + ): continue - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 - + new_signal_desc.lower_frequency = ( + -0.5 + if new_signal_desc.lower_frequency < -0.5 + else new_signal_desc.lower_frequency + ) + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) + 
new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 + ) + # Append SignalDescription to list new_signal_description.append(new_signal_desc) - + # Set output data's SignalDescription to above list new_data.signal_description = new_signal_description - + else: new_data = functional.spec_translate(data, time_shift, freq_shift) return new_data - - + + class SpectrogramMosaicCrop(SignalTransform): """The SpectrogramMosaicCrop transform takes the original input tensor and - inserts it randomly into one cell of a 2x2 grid of 2x the size of the - orginal spectrogram input. The `dataset` argument is then read 3x to + inserts it randomly into one cell of a 2x2 grid of 2x the size of the + orginal spectrogram input. The `dataset` argument is then read 3x to retrieve spectrograms to fill the remaining cells of the 2x2 grid. Finally, the 2x larger stitched view of 4x spectrograms is randomly cropped to the original target size, containing pieces of each of the 4x stitched spectrograms. - + Args: dataset :obj:`SignalDataset`: An SignalDataset of complex-valued examples to be used as a source for the mosaic operation - + """ + def __init__(self, dataset: SignalDataset = None): super(SpectrogramMosaicCrop, self).__init__() self.dataset = dataset @@ -417,71 +316,129 @@ def __call__(self, data: Any) -> Any: data_type=np.dtype(np.complex128), signal_description=data.signal_description, ) - + # Read shapes channels, height, width = data.iq_data.shape - + # Randomly decide the new x0, y0 point of the stitched images - x0 = np.random.randint(0,width) - y0 = np.random.randint(0,height) - + x0 = np.random.randint(0, width) + y0 = np.random.randint(0, height) + # Initialize new SignalDescription object new_signal_description = [] - + # First, create a 2x2 grid of (512+512,512+512) and randomly put the initial data into a grid cell - cell_idx = np.random.randint(0,4) + cell_idx = np.random.randint(0, 4) x_idx = 0 if cell_idx == 0 or cell_idx == 2 else 1 y_idx = 0 if cell_idx == 0 or cell_idx == 1 else 1 full_mosaic = np.empty( - (channels, height*2, width*2), + (channels, height * 2, width * 2), dtype=data.iq_data.dtype, ) - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = data.iq_data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = data.iq_data + # Update original data's SignalDescription objects given the cell index - signal_description = [data.signal_description] if isinstance(data.signal_description, SignalDescription) else data.signal_description + signal_description = ( + [data.signal_description] + if isinstance(data.signal_description, SignalDescription) + else data.signal_description + ) for signal_desc in signal_description: new_signal_desc = deepcopy(signal_desc) - + # Update time fields if x_idx == 0: if new_signal_desc.stop * width < x0: continue - new_signal_desc.start = 0 if new_signal_desc.start < (x0 / width) else new_signal_desc.start - (x0 / width) - new_signal_desc.stop = new_signal_desc.stop - (x0 / width) if new_signal_desc.stop < 1.0 else 1.0 - (x0 / width) - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start - + new_signal_desc.start = ( + 0 + if new_signal_desc.start < (x0 / width) + else new_signal_desc.start - (x0 / width) + ) + new_signal_desc.stop = ( + new_signal_desc.stop - (x0 / width) + if new_signal_desc.stop < 1.0 + else 
1.0 - (x0 / width) + ) + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) + else: if new_signal_desc.start * width > x0: continue new_signal_desc.start = (width - x0) / width + new_signal_desc.start new_signal_desc.stop = (width - x0) / width + new_signal_desc.stop - new_signal_desc.stop = 1.0 if new_signal_desc.stop > 1.0 else new_signal_desc.stop - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start + new_signal_desc.stop = ( + 1.0 if new_signal_desc.stop > 1.0 else new_signal_desc.stop + ) + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) # Update frequency fields - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency + new_signal_desc.lower_frequency = ( + -0.5 + if new_signal_desc.lower_frequency < -0.5 + else new_signal_desc.lower_frequency + ) + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) if y_idx == 0: - if (new_signal_desc.upper_frequency+0.5) * height < y0: + if (new_signal_desc.upper_frequency + 0.5) * height < y0: continue - new_signal_desc.lower_frequency = -0.5 if (new_signal_desc.lower_frequency+0.5) < (y0 / height) else new_signal_desc.lower_frequency - (y0 / height) - new_signal_desc.upper_frequency = new_signal_desc.upper_frequency - (y0 / height) if new_signal_desc.upper_frequency < 0.5 else 0.5 - (y0 / height) - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 - + new_signal_desc.lower_frequency = ( + -0.5 + if (new_signal_desc.lower_frequency + 0.5) < (y0 / height) + else new_signal_desc.lower_frequency - (y0 / height) + ) + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency - (y0 / height) + if new_signal_desc.upper_frequency < 0.5 + else 0.5 - (y0 / height) + ) + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) + else: - if (new_signal_desc.lower_frequency+0.5) * height > y0: + if (new_signal_desc.lower_frequency + 0.5) * height > y0: continue - new_signal_desc.lower_frequency = (height - y0) / height + new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = (height - y0) / height + new_signal_desc.upper_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 - + new_signal_desc.lower_frequency = ( + height - y0 + ) / height + new_signal_desc.lower_frequency + new_signal_desc.upper_frequency = ( + height - y0 + ) / height + new_signal_desc.upper_frequency + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) + # Append 
SignalDescription to list new_signal_description.append(new_signal_desc) - + # Next, fill in the remaining cells with data randomly sampled from the input dataset for cell_i in range(4): if cell_i == cell_idx: @@ -491,10 +448,18 @@ def __call__(self, data: Any) -> Any: y_idx = 0 if cell_i == 0 or cell_i == 1 else 1 dataset_idx = np.random.randint(len(self.dataset)) curr_data, curr_signal_desc = self.dataset[dataset_idx] - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = curr_data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = curr_data + # Update inserted data's SignalDescription objects given the cell index - signal_description = [curr_signal_desc] if isinstance(curr_signal_desc, SignalDescription) else curr_signal_desc + signal_description = ( + [curr_signal_desc] + if isinstance(curr_signal_desc, SignalDescription) + else curr_signal_desc + ) for signal_desc in signal_description: new_signal_desc = deepcopy(signal_desc) @@ -502,68 +467,126 @@ def __call__(self, data: Any) -> Any: if x_idx == 0: if new_signal_desc.stop * width < x0: continue - new_signal_desc.start = 0 if new_signal_desc.start < (x0 / width) else new_signal_desc.start - (x0 / width) - new_signal_desc.stop = new_signal_desc.stop - (x0 / width) if new_signal_desc.stop < 1.0 else 1.0 - (x0 / width) - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start + new_signal_desc.start = ( + 0 + if new_signal_desc.start < (x0 / width) + else new_signal_desc.start - (x0 / width) + ) + new_signal_desc.stop = ( + new_signal_desc.stop - (x0 / width) + if new_signal_desc.stop < 1.0 + else 1.0 - (x0 / width) + ) + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) else: if new_signal_desc.start * width > x0: continue - new_signal_desc.start = (width - x0) / width + new_signal_desc.start - new_signal_desc.stop = (width - x0) / width + new_signal_desc.stop - new_signal_desc.stop = 1.0 if new_signal_desc.stop > 1.0 else new_signal_desc.stop - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start + new_signal_desc.start = ( + width - x0 + ) / width + new_signal_desc.start + new_signal_desc.stop = ( + width - x0 + ) / width + new_signal_desc.stop + new_signal_desc.stop = ( + 1.0 if new_signal_desc.stop > 1.0 else new_signal_desc.stop + ) + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) # Update frequency fields - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency + new_signal_desc.lower_frequency = ( + -0.5 + if new_signal_desc.lower_frequency < -0.5 + else new_signal_desc.lower_frequency + ) + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) if y_idx == 0: - if (new_signal_desc.upper_frequency+0.5) * height < y0: + if (new_signal_desc.upper_frequency + 0.5) * height < y0: continue - new_signal_desc.lower_frequency = -0.5 if (new_signal_desc.lower_frequency+0.5) < (y0 / height) else new_signal_desc.lower_frequency - (y0 / height) - new_signal_desc.upper_frequency = new_signal_desc.upper_frequency - (y0 / height) if new_signal_desc.upper_frequency < 0.5 else 0.5 - (y0 / height) - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - 
new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 + new_signal_desc.lower_frequency = ( + -0.5 + if (new_signal_desc.lower_frequency + 0.5) < (y0 / height) + else new_signal_desc.lower_frequency - (y0 / height) + ) + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency - (y0 / height) + if new_signal_desc.upper_frequency < 0.5 + else 0.5 - (y0 / height) + ) + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) else: - if (new_signal_desc.lower_frequency+0.5) * height > y0: + if (new_signal_desc.lower_frequency + 0.5) * height > y0: continue - new_signal_desc.lower_frequency = (height - y0) / height + new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = (height - y0) / height + new_signal_desc.upper_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 + new_signal_desc.lower_frequency = ( + height - y0 + ) / height + new_signal_desc.lower_frequency + new_signal_desc.upper_frequency = ( + height - y0 + ) / height + new_signal_desc.upper_frequency + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) # Append SignalDescription to list new_signal_description.append(new_signal_desc) - + # After the data has been stitched into the large 2x2 gride, crop using x0, y0 - new_data.iq_data = full_mosaic[:,y0:y0+height,x0:x0+width] - + new_data.iq_data = full_mosaic[:, y0 : y0 + height, x0 : x0 + width] + # Set output data's SignalDescription to above list new_data.signal_description = new_signal_description - + else: # Read shapes channels, height, width = data.shape - + # Randomly decide the new x0, y0 point of the stitched images - x0 = np.random.randint(0,width) - y0 = np.random.randint(0,height) - + x0 = np.random.randint(0, width) + y0 = np.random.randint(0, height) + # Initialize new SignalDescription object new_signal_description = [] - + # First, create a 2x2 grid of (512+512,512+512) and randomly put the initial data into a grid cell - cell_idx = np.random.randint(0,4) + cell_idx = np.random.randint(0, 4) x_idx = 0 if cell_idx == 0 or cell_idx == 2 else 1 y_idx = 0 if cell_idx == 0 or cell_idx == 1 else 1 full_mosaic = np.empty( - (channels, height*2, width*2), + (channels, height * 2, width * 2), dtype=data.dtype, ) - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = data + # Next, fill in the remaining cells with data randomly sampled from the input dataset for cell_i in range(4): if cell_i == cell_idx: @@ -573,28 +596,33 @@ def __call__(self, data: Any) -> Any: y_idx = 0 if cell_i == 0 or cell_i == 1 else 1 dataset_idx = np.random.randint(len(self.dataset)) curr_data, curr_signal_desc = self.dataset[dataset_idx] - 
full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = curr_data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = curr_data + # After the data has been stitched into the large 2x2 gride, crop using x0, y0 - new_data = full_mosaic[:,y0:y0+height,x0:x0+width] - + new_data = full_mosaic[:, y0 : y0 + height, x0 : x0 + width] + return new_data - - + + class SpectrogramMosaicDownsample(SignalTransform): - """The SpectrogramMosaicDownsample transform takes the original input - tensor and inserts it randomly into one cell of a 2x2 grid of 2x the size - of the orginal spectrogram input. The `dataset` argument is then read 3x to + """The SpectrogramMosaicDownsample transform takes the original input + tensor and inserts it randomly into one cell of a 2x2 grid of 2x the size + of the orginal spectrogram input. The `dataset` argument is then read 3x to retrieve spectrograms to fill the remaining cells of the 2x2 grid. Finally, - the 2x oversized stitched spectrograms are downsampled by 2 to become the + the 2x oversized stitched spectrograms are downsampled by 2 to become the desired, original shape - + Args: dataset :obj:`SignalDataset`: An SignalDataset of complex-valued examples to be used as a source for the mosaic operation - + """ + def __init__(self, dataset: SignalDataset = None): super(SpectrogramMosaicDownsample, self).__init__() self.dataset = dataset @@ -608,57 +636,97 @@ def __call__(self, data: Any) -> Any: data_type=np.dtype(np.complex128), signal_description=data.signal_description, ) - + # Read shapes channels, height, width = data.iq_data.shape - + # Initialize new SignalDescription object new_signal_description = [] - + # First, create a 2x2 grid of (512+512,512+512) and randomly put the initial data into a grid cell - cell_idx = np.random.randint(0,4) + cell_idx = np.random.randint(0, 4) x_idx = 0 if cell_idx == 0 or cell_idx == 2 else 1 y_idx = 0 if cell_idx == 0 or cell_idx == 1 else 1 full_mosaic = np.empty( - (channels, height*2, width*2), + (channels, height * 2, width * 2), dtype=data.iq_data.dtype, ) - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = data.iq_data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = data.iq_data + # Update original data's SignalDescription objects given the cell index - signal_description = [data.signal_description] if isinstance(data.signal_description, SignalDescription) else data.signal_description + signal_description = ( + [data.signal_description] + if isinstance(data.signal_description, SignalDescription) + else data.signal_description + ) for signal_desc in signal_description: new_signal_desc = deepcopy(signal_desc) - + # Update time fields if x_idx == 0: new_signal_desc.start /= 2 new_signal_desc.stop /= 2 - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start - + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) + else: new_signal_desc.start = new_signal_desc.start / 2 + 0.5 new_signal_desc.stop = new_signal_desc.stop / 2 + 0.5 - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) # Update frequency fields - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else 
new_signal_desc.upper_frequency + new_signal_desc.lower_frequency = ( + -0.5 + if new_signal_desc.lower_frequency < -0.5 + else new_signal_desc.lower_frequency + ) + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) if y_idx == 0: - new_signal_desc.lower_frequency = (new_signal_desc.lower_frequency+0.5) / 2 - 0.5 - new_signal_desc.upper_frequency = (new_signal_desc.upper_frequency+0.5) / 2 - 0.5 - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 - + new_signal_desc.lower_frequency = ( + new_signal_desc.lower_frequency + 0.5 + ) / 2 - 0.5 + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency + 0.5 + ) / 2 - 0.5 + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) + else: - new_signal_desc.lower_frequency = (new_signal_desc.lower_frequency+0.5) / 2 - new_signal_desc.upper_frequency = (new_signal_desc.upper_frequency+0.5) / 2 - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 - + new_signal_desc.lower_frequency = ( + new_signal_desc.lower_frequency + 0.5 + ) / 2 + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency + 0.5 + ) / 2 + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) + # Append SignalDescription to list new_signal_description.append(new_signal_desc) - + # Next, fill in the remaining cells with data randomly sampled from the input dataset for cell_i in range(4): if cell_i == cell_idx: @@ -668,10 +736,18 @@ def __call__(self, data: Any) -> Any: y_idx = 0 if cell_i == 0 or cell_i == 1 else 1 dataset_idx = np.random.randint(len(self.dataset)) curr_data, curr_signal_desc = self.dataset[dataset_idx] - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = curr_data + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = curr_data # Update inserted data's SignalDescription objects given the cell index - signal_description = [curr_signal_desc] if isinstance(curr_signal_desc, SignalDescription) else curr_signal_desc + signal_description = ( + [curr_signal_desc] + if isinstance(curr_signal_desc, SignalDescription) + else curr_signal_desc + ) for signal_desc in signal_description: new_signal_desc = deepcopy(signal_desc) @@ -679,54 +755,90 @@ def __call__(self, data: Any) -> Any: if x_idx == 0: new_signal_desc.start /= 2 new_signal_desc.stop /= 2 - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) else: new_signal_desc.start = new_signal_desc.start / 2 + 0.5 new_signal_desc.stop = new_signal_desc.stop / 2 + 0.5 - new_signal_desc.duration = new_signal_desc.stop - new_signal_desc.start + new_signal_desc.duration = ( + new_signal_desc.stop - new_signal_desc.start + ) # Update frequency fields - new_signal_desc.lower_frequency = -0.5 if new_signal_desc.lower_frequency < -0.5 else 
new_signal_desc.lower_frequency - new_signal_desc.upper_frequency = 0.5 if new_signal_desc.upper_frequency > 0.5 else new_signal_desc.upper_frequency + new_signal_desc.lower_frequency = ( + -0.5 + if new_signal_desc.lower_frequency < -0.5 + else new_signal_desc.lower_frequency + ) + new_signal_desc.upper_frequency = ( + 0.5 + if new_signal_desc.upper_frequency > 0.5 + else new_signal_desc.upper_frequency + ) if y_idx == 0: - new_signal_desc.lower_frequency = (new_signal_desc.lower_frequency+0.5) / 2 - 0.5 - new_signal_desc.upper_frequency = (new_signal_desc.upper_frequency+0.5) / 2 - 0.5 - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 + new_signal_desc.lower_frequency = ( + new_signal_desc.lower_frequency + 0.5 + ) / 2 - 0.5 + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency + 0.5 + ) / 2 - 0.5 + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) else: - new_signal_desc.lower_frequency = (new_signal_desc.lower_frequency+0.5) / 2 - new_signal_desc.upper_frequency = (new_signal_desc.upper_frequency+0.5) / 2 - new_signal_desc.bandwidth = new_signal_desc.upper_frequency - new_signal_desc.lower_frequency - new_signal_desc.center_frequency = new_signal_desc.lower_frequency + new_signal_desc.bandwidth * 0.5 + new_signal_desc.lower_frequency = ( + new_signal_desc.lower_frequency + 0.5 + ) / 2 + new_signal_desc.upper_frequency = ( + new_signal_desc.upper_frequency + 0.5 + ) / 2 + new_signal_desc.bandwidth = ( + new_signal_desc.upper_frequency + - new_signal_desc.lower_frequency + ) + new_signal_desc.center_frequency = ( + new_signal_desc.lower_frequency + + new_signal_desc.bandwidth * 0.5 + ) # Append SignalDescription to list new_signal_description.append(new_signal_desc) - + # After the data has been stitched into the large 2x2 gride, downsample by 2 - new_data.iq_data = full_mosaic[:,::2,::2] - + new_data.iq_data = full_mosaic[:, ::2, ::2] + # Set output data's SignalDescription to above list new_data.signal_description = new_signal_description - + else: # Read shapes channels, height, width = data.shape - + # Initialize new SignalDescription object new_signal_description = [] - + # First, create a 2x2 grid of (512+512,512+512) and randomly put the initial data into a grid cell - cell_idx = np.random.randint(0,4) + cell_idx = np.random.randint(0, 4) x_idx = 0 if cell_idx == 0 or cell_idx == 2 else 1 y_idx = 0 if cell_idx == 0 or cell_idx == 1 else 1 full_mosaic = np.empty( - (channels, height*2, width*2), + (channels, height * 2, width * 2), dtype=data.dtype, ) - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : (x_idx + 1) * width, + ] = data + # Next, fill in the remaining cells with data randomly sampled from the input dataset for cell_i in range(4): if cell_i == cell_idx: @@ -736,9 +848,13 @@ def __call__(self, data: Any) -> Any: y_idx = 0 if cell_i == 0 or cell_i == 1 else 1 dataset_idx = np.random.randint(len(self.dataset)) curr_data, curr_signal_desc = self.dataset[dataset_idx] - full_mosaic[:,y_idx*height:(y_idx+1)*height,x_idx*width:(x_idx+1)*width] = curr_data - + full_mosaic[ + :, + y_idx * height : (y_idx + 1) * height, + x_idx * width : 
(x_idx + 1) * width, + ] = curr_data + # After the data has been stitched into the large 2x2 gride, downsample by 2 - new_data = full_mosaic[:,::2,::2] - + new_data = full_mosaic[:, ::2, ::2] + return new_data diff --git a/torchsig/transforms/target_transforms/target_transforms.py b/torchsig/transforms/target_transforms/target_transforms.py index f41a046..f26f7e9 100644 --- a/torchsig/transforms/target_transforms/target_transforms.py +++ b/torchsig/transforms/target_transforms/target_transforms.py @@ -11,15 +11,26 @@ class DescToClassName(Transform): or a list of the classes present if there are multiple classes """ + def __init__(self): super(DescToClassName, self).__init__() - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> Union[List[str], str]: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> Union[List[str], str]: classes = [] # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) for signal_desc_idx, signal_desc in enumerate(signal_description): - curr_class_name = signal_desc.class_name[0] if isinstance(signal_desc.class_name, list) else signal_desc.class_name + curr_class_name = ( + signal_desc.class_name[0] + if isinstance(signal_desc.class_name, list) + else signal_desc.class_name + ) classes.append(curr_class_name) if len(classes) > 1: return classes @@ -27,7 +38,7 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc return classes[0] else: return [] - + class DescToClassNameSNR(Transform): """Transform to transform SignalDescription into either the single class name @@ -35,17 +46,21 @@ class DescToClassNameSNR(Transform): the SNRs for each """ + def __init__(self): super(DescToClassNameSNR, self).__init__() def __call__( - self, - signal_description: Union[List[SignalDescription], SignalDescription] - ) -> Union[Tuple[List[str],List[float]], Tuple[str, float]]: + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> Union[Tuple[List[str], List[float]], Tuple[str, float]]: classes = [] snrs = [] # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) for signal_desc_idx, signal_desc in enumerate(signal_description): classes.append(signal_desc.class_name) snrs.append(signal_desc.snr) @@ -53,28 +68,35 @@ def __call__( return classes, snrs else: return classes[0], snrs[0] - + class DescToClassIndex(Transform): """Transform to transform SignalDescription into either the single class index or a list of the class indices present if there are multiple classes. 
Note: - if the SignalDescription contains classes not present in the provided - `class_list`, the SignalDescription is interpretted as having no classes + if the SignalDescription contains classes not present in the provided + `class_list`, the SignalDescription is interpretted as having no classes present - + Args: class_list (:obj:`List[str]`): A full list of classes to map the class names to indices """ + def __init__(self, class_list: List[str] = None): super(DescToClassIndex, self).__init__() self.class_list = class_list - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> Union[List[int], int]: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> Union[List[int], int]: classes = [] # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) for signal_desc_idx, signal_desc in enumerate(signal_description): if signal_desc.class_name in self.class_list: classes.append(self.class_list.index(signal_desc.class_name)) @@ -83,31 +105,35 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc else: return classes[0] - + class DescToClassIndexSNR(Transform): """Transform to transform SignalDescription into either the single class index or a list of the class indices present if there are multiple classes along - with the SNRs of each. Note: if the SignalDescription contains classes not + with the SNRs of each. Note: if the SignalDescription contains classes not present in the provided `class_list`, the SignalDescription is interpretted as having no classes present - + Args: class_list (:obj:`List[str]`): A full list of classes to map the class names to indices """ + def __init__(self, class_list: List[str] = None): super(DescToClassIndexSNR, self).__init__() self.class_list = class_list def __call__( - self, - signal_description: Union[List[SignalDescription], SignalDescription] + self, signal_description: Union[List[SignalDescription], SignalDescription] ) -> Union[Tuple[List[int], List[float]], Tuple[int, float]]: classes = [] snrs = [] # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) for signal_desc_idx, signal_desc in enumerate(signal_description): if signal_desc.class_name in self.class_list: classes.append(self.class_list.index(signal_desc.class_name)) @@ -117,7 +143,7 @@ def __call__( else: return classes[0], snrs[0] - + class DescToMask(Transform): """Transform to transform SignalDescriptions into spectrogram masks @@ -128,17 +154,24 @@ class DescToMask(Transform): Width of resultant spectrogram mask height (:obj:`int`): Height of resultant spectrogram mask - + """ + def __init__(self, max_bursts: int, width: int, height: int): super(DescToMask, self).__init__() self.max_bursts = max_bursts self.width = width self.height = height - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> 
np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) masks = np.zeros((self.max_bursts, self.height, self.width)) idx = 0 for signal_desc in signal_description: @@ -146,22 +179,33 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: signal_desc.upper_frequency = 0.5 - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 idx += 1 return masks - - + + class DescToMaskSignal(Transform): """Transform to transform SignalDescriptions into spectrogram masks for binary signal detection @@ -171,35 +215,53 @@ class DescToMaskSignal(Transform): Width of resultant spectrogram mask height (:obj:`int`): Height of resultant spectrogram mask - + """ + def __init__(self, width: int, height: int): super(DescToMaskSignal, self).__init__() self.width = width self.height = height - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) masks = np.zeros((self.height, self.width)) for signal_desc in signal_description: if signal_desc.lower_frequency < -0.5: signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: signal_desc.upper_frequency = 0.5 - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * 
self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 return masks - - + + class DescToMaskFamily(Transform): """Transform to transform SignalDescriptions into spectrogram masks with different channels for each class's family. If no `class_family_dict` @@ -214,81 +276,95 @@ class DescToMaskFamily(Transform): Width of resultant spectrogram mask height (:obj:`int`): Height of resultant spectrogram mask - + """ + class_family_dict = { - '4ask':'ask', - '8ask':'ask', - '16ask':'ask', - '32ask':'ask', - '64ask':'ask', - 'ook':'pam', - '4pam':'pam', - '8pam':'pam', - '16pam':'pam', - '32pam':'pam', - '64pam':'pam', - '2fsk':'fsk', - '2gfsk':'fsk', - '2msk':'fsk', - '2gmsk':'fsk', - '4fsk':'fsk', - '4gfsk':'fsk', - '4msk':'fsk', - '4gmsk':'fsk', - '8fsk':'fsk', - '8gfsk':'fsk', - '8msk':'fsk', - '8gmsk':'fsk', - '16fsk':'fsk', - '16gfsk':'fsk', - '16msk':'fsk', - '16gmsk':'fsk', - 'bpsk':'psk', - 'qpsk':'psk', - '8psk':'psk', - '16psk':'psk', - '32psk':'psk', - '64psk':'psk', - '16qam':'qam', - '32qam':'qam', - '32qam_cross':'qam', - '64qam':'qam', - '128qam_cross':'qam', - '256qam':'qam', - '512qam_cross':'qam', - '1024qam':'qam', - 'ofdm-64':'ofdm', - 'ofdm-72':'ofdm', - 'ofdm-128':'ofdm', - 'ofdm-180':'ofdm', - 'ofdm-256':'ofdm', - 'ofdm-300':'ofdm', - 'ofdm-512':'ofdm', - 'ofdm-600':'ofdm', - 'ofdm-900':'ofdm', - 'ofdm-1024':'ofdm', - 'ofdm-1200':'ofdm', - 'ofdm-2048':'ofdm', + "4ask": "ask", + "8ask": "ask", + "16ask": "ask", + "32ask": "ask", + "64ask": "ask", + "ook": "pam", + "4pam": "pam", + "8pam": "pam", + "16pam": "pam", + "32pam": "pam", + "64pam": "pam", + "2fsk": "fsk", + "2gfsk": "fsk", + "2msk": "fsk", + "2gmsk": "fsk", + "4fsk": "fsk", + "4gfsk": "fsk", + "4msk": "fsk", + "4gmsk": "fsk", + "8fsk": "fsk", + "8gfsk": "fsk", + "8msk": "fsk", + "8gmsk": "fsk", + "16fsk": "fsk", + "16gfsk": "fsk", + "16msk": "fsk", + "16gmsk": "fsk", + "bpsk": "psk", + "qpsk": "psk", + "8psk": "psk", + "16psk": "psk", + "32psk": "psk", + "64psk": "psk", + "16qam": "qam", + "32qam": "qam", + "32qam_cross": "qam", + "64qam": "qam", + "128qam_cross": "qam", + "256qam": "qam", + "512qam_cross": "qam", + "1024qam": "qam", + "ofdm-64": "ofdm", + "ofdm-72": "ofdm", + "ofdm-128": "ofdm", + "ofdm-180": "ofdm", + "ofdm-256": "ofdm", + "ofdm-300": "ofdm", + "ofdm-512": "ofdm", + "ofdm-600": "ofdm", + "ofdm-900": "ofdm", + "ofdm-1024": "ofdm", + "ofdm-1200": "ofdm", + "ofdm-2048": "ofdm", } + def __init__( - self, - width: int, + self, + width: int, height: int, - class_family_dict: dict = None, + class_family_dict: dict = None, family_list: list = None, label_encode: bool = False, ): super(DescToMaskFamily, self).__init__() - self.class_family_dict = class_family_dict if class_family_dict else self.class_family_dict - self.family_list = family_list if family_list else sorted(list(set(self.class_family_dict.values()))) + self.class_family_dict = ( + class_family_dict if class_family_dict else self.class_family_dict + ) + self.family_list = ( + family_list + if family_list + else sorted(list(set(self.class_family_dict.values()))) + ) self.width = width self.height = height self.label_encode = 
label_encode - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) masks = np.zeros((len(self.family_list), self.height, self.width)) for signal_desc in signal_description: if signal_desc.lower_frequency < -0.5: @@ -299,27 +375,38 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc signal_desc.class_name = signal_desc.class_name[0] family_name = self.class_family_dict[signal_desc.class_name] family_idx = self.family_list.index(family_name) - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ family_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ family_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 if self.label_encode: background_mask = np.zeros((1, self.height, self.height)) masks = np.concatenate([background_mask, masks], axis=0) masks = np.argmax(masks, axis=0) return masks - - + + class DescToMaskClass(Transform): - """Transform to transform list of SignalDescriptions into spectrogram masks + """Transform to transform list of SignalDescriptions into spectrogram masks with classes Args: @@ -329,45 +416,63 @@ class DescToMaskClass(Transform): Width of resultant spectrogram mask height (:obj:`int`): Height of resultant spectrogram mask - + """ + def __init__(self, num_classes: int, width: int, height: int): super(DescToMaskClass, self).__init__() self.num_classes = num_classes self.width = width self.height = height - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) masks = np.zeros((self.num_classes, self.height, self.width)) for signal_desc in signal_description: if signal_desc.lower_frequency < -0.5: signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: 
signal_desc.upper_frequency = 0.5 - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ signal_desc.class_index, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ signal_desc.class_index, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 return masks - - + + class DescToSemanticClass(Transform): """Transform to transform SignalDescriptions into spectrogram semantic segmentation mask with class information denoted as a value, rather than by - a one/multi-hot vector in an additional channel like the + a one/multi-hot vector in an additional channel like the DescToMaskClass does. Note that the class indicies are all - incremented by 1 in order to reserve the 0 class for "background". Note - that cases of overlapping bursts are currently resolved by comparing SNRs, + incremented by 1 in order to reserve the 0 class for "background". Note + that cases of overlapping bursts are currently resolved by comparing SNRs, labeling the pixel by the stronger signal. Ties in SNR are awarded to the burst that appears later in the burst collection. 
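As context for the overlap rule described above, the per-pixel SNR tie-breaking can be sketched standalone as follows. This is an illustrative reimplementation with hypothetical burst tuples, not the transform's actual API (which consumes SignalDescription objects and a num_classes argument).

import numpy as np

def semantic_mask(bursts, height=64, width=64):
    # Illustrative only: rasterize (class_index, snr, start, stop, lower_f, upper_f)
    # tuples into a semantic mask. 0 is reserved for background; overlapping
    # pixels keep the class of the stronger (higher-SNR) burst, with ties going
    # to the burst that appears later in the list.
    mask = np.zeros((height, width), dtype=np.int64)
    best_snr = np.full((height, width), -np.inf)
    for class_index, snr, start, stop, lower_f, upper_f in bursts:
        # Clamp normalized frequencies to [-0.5, 0.5] and convert to pixel bounds
        lower_f, upper_f = max(lower_f, -0.5), min(upper_f, 0.5)
        h0 = max(0, int((lower_f + 0.5) * height))
        h1 = min(int((upper_f + 0.5) * height), height)
        w0 = max(0, int(start * width))
        w1 = min(int(stop * width), width)
        if h0 == h1:  # sub-pixel bandwidths still occupy one row
            h1 = min(h1 + 1, height)
        region = best_snr[h0:h1, w0:w1]
        winner = snr >= region
        mask[h0:h1, w0:w1][winner] = class_index + 1
        region[winner] = snr
    return mask

# Two overlapping bursts; the second, stronger burst owns the shared pixels.
example = semantic_mask([(3, 10.0, 0.1, 0.6, -0.2, 0.1),
                         (7, 20.0, 0.4, 0.9, -0.1, 0.3)])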
@@ -378,19 +483,26 @@ class DescToSemanticClass(Transform): Width of resultant spectrogram mask height (:obj:`int`): Height of resultant spectrogram mask - + """ + def __init__(self, num_classes: int, width: int, height: int): super(DescToSemanticClass, self).__init__() self.num_classes = num_classes self.width = width self.height = height - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) masks = np.zeros((self.height, self.width)) - curr_snrs = np.ones((self.height, self.width))*-np.inf + curr_snrs = np.ones((self.height, self.width)) * -np.inf for signal_desc in signal_description: # Normalize freq values to [0,1] if signal_desc.lower_frequency < -0.5: @@ -399,14 +511,18 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc signal_desc.upper_frequency = 0.5 # Convert to pixels - height_start = max(0, int((signal_desc.lower_frequency+0.5) * self.height)) - height_stop = min(int((signal_desc.upper_frequency+0.5) * self.height), self.height) + height_start = max( + 0, int((signal_desc.lower_frequency + 0.5) * self.height) + ) + height_stop = min( + int((signal_desc.upper_frequency + 0.5) * self.height), self.height + ) width_start = max(0, int(signal_desc.start * self.width)) width_stop = min(int(signal_desc.stop * self.width), self.width) # Account for signals with bandwidths < a pixel if height_start == height_stop: - height_stop = min(height_stop+1, self.height) + height_stop = min(height_stop + 1, self.height) # Loop through pixels for height_idx in range(height_start, height_stop): @@ -414,17 +530,16 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc # Check SNR against currently stored SNR at pixel if signal_desc.snr >= curr_snrs[height_idx, width_idx]: # If SNR >= currently stored class's SNR, update class & snr - masks[ - height_start : height_stop, - width_start : width_stop, - ] = signal_desc.class_index+1 + masks[height_start:height_stop, width_start:width_stop,] = ( + signal_desc.class_index + 1 + ) curr_snrs[ - height_start : height_stop, - width_start : width_stop, + height_start:height_stop, + width_start:width_stop, ] = signal_desc.snr_db return masks - + class DescToBBox(Transform): """Transform to transform SignalDescriptions into spectrogram bounding boxes with dimensions: , where the last 5 represents: @@ -433,7 +548,7 @@ class DescToBBox(Transform): - 2: dur_time ~ normalized to full spec time - 3: center_freq ~ normalized to cell - 4: bw_freq ~ normalized to full spec bw - + Args: grid_width (:obj:`int`): Width of grid celling @@ -441,14 +556,21 @@ class DescToBBox(Transform): Height of grid celling """ + def __init__(self, grid_width: int, grid_height: int): super(DescToBBox, self).__init__() self.grid_width = grid_width self.grid_height = grid_height - - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and 
lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) boxes = np.zeros((self.grid_width, self.grid_height, 5)) for signal_desc in signal_description: # Time conversions @@ -470,8 +592,12 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: signal_desc.upper_frequency = 0.5 - signal_desc.bandwidth = signal_desc.upper_frequency - signal_desc.lower_frequency - signal_desc.center_frequency = signal_desc.lower_frequency + signal_desc.bandwidth / 2 + signal_desc.bandwidth = ( + signal_desc.upper_frequency - signal_desc.lower_frequency + ) + signal_desc.center_frequency = ( + signal_desc.lower_frequency + signal_desc.bandwidth / 2 + ) y = (signal_desc.center_frequency + 0.5) * self.grid_height freq_cell = int(np.floor(y)) center_freq = y - freq_cell @@ -498,17 +624,17 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc boxes[time_cell, freq_cell, 4] = signal_desc.bandwidth return boxes - + class DescToAnchorBoxes(Transform): """Transform to transform BurstCollections into spectrogram bounding boxes - using anchor boxes, such that the output target shape will have the + using anchor boxes, such that the output target shape will have the dimensions: , where the last 5 represents: - 0: objectness ~ 1 if burst associated with current cell & anchor, else 0 - 1: center_time ~ normalized to cell - 2: dur_offset ~ offset in duration with anchor box duration - 3: center_freq ~ normalized to cell - 4: bw_offset ~ offset in bandwidth with anchor box duration - + Args: grid_width (:obj:`int`): Width of grid celling @@ -519,6 +645,7 @@ class DescToAnchorBoxes(Transform): Example format: [(dur1, bw1), (dur2, bw2)] """ + def __init__(self, grid_width: int, grid_height: int, anchor_boxes: List): super(DescToAnchorBoxes, self).__init__() self.grid_width = grid_width @@ -527,17 +654,19 @@ def __init__(self, grid_width: int, grid_height: int, anchor_boxes: List): self.num_anchor_boxes = len(anchor_boxes) # IoU function - def iou(self, start_a, dur_a, center_freq_a, bw_a, start_b, dur_b, center_freq_b, bw_b): + def iou( + self, start_a, dur_a, center_freq_a, bw_a, start_b, dur_b, center_freq_b, bw_b + ): # Convert to start/stops x_start_a = start_a x_stop_a = start_a + dur_a - y_start_a = center_freq_a - bw_a/2 - y_stop_a = center_freq_a + bw_a/2 + y_start_a = center_freq_a - bw_a / 2 + y_stop_a = center_freq_a + bw_a / 2 x_start_b = start_b x_stop_b = start_b + dur_b - y_start_b = center_freq_b - bw_b/2 - y_stop_b = center_freq_b + bw_b/2 + y_start_b = center_freq_b - bw_b / 2 + y_stop_b = center_freq_b + bw_b / 2 # Determine the (x, y)-coordinates of the intersection x_start_int = max(x_start_a, x_start_b) @@ -546,7 +675,9 @@ def iou(self, start_a, dur_a, center_freq_a, bw_a, start_b, dur_b, center_freq_b y_stop_int = min(y_stop_a, y_stop_b) # Compute the area of intersection - inter_area = abs(max((x_stop_int - x_start_int, 0)) * max((y_stop_int - y_start_int), 0)) + inter_area = abs( + max((x_stop_int - x_start_int, 0)) * max((y_stop_int - y_start_int), 0) + ) if inter_area == 0: return 0 # Compute the area of both the prediction and ground-truth @@ -557,10 +688,16 @@ def iou(self, start_a, dur_a, center_freq_a, bw_a, start_b, dur_b, center_freq_b iou = 
inter_area / float(area_a + area_b - inter_area) return iou - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description - boxes = np.zeros((self.grid_width, self.grid_height, 5*self.num_anchor_boxes)) + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) + boxes = np.zeros((self.grid_width, self.grid_height, 5 * self.num_anchor_boxes)) for signal_desc in signal_description: # Time conversions if signal_desc.start > 1.0: @@ -597,15 +734,30 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc best_anchor_duration = 0 best_anchor_bw = 0 for anchor_idx, anchor_box in enumerate(self.anchor_boxes): - #anchor_start = ((time_cell+0.5) / self.grid_width) - (anchor_box[0]*0.5) # Anchor centered on cell - anchor_start = signal_desc.start + 0.5*signal_desc.duration - anchor_box[0]*0.5 # Anchor overlaid on burst + # anchor_start = ((time_cell+0.5) / self.grid_width) - (anchor_box[0]*0.5) # Anchor centered on cell + anchor_start = ( + signal_desc.start + 0.5 * signal_desc.duration - anchor_box[0] * 0.5 + ) # Anchor overlaid on burst anchor_duration = anchor_box[0] - #anchor_center_freq = (freq_cell+0.5) / self.grid_height # Anchor centered on cell - anchor_center_freq = signal_desc.center_frequency # Anchor overlaid on burst + # anchor_center_freq = (freq_cell+0.5) / self.grid_height # Anchor centered on cell + anchor_center_freq = ( + signal_desc.center_frequency + ) # Anchor overlaid on burst anchor_bw = anchor_box[1] - iou_score = self.iou(signal_desc.start, signal_desc.duration, signal_desc.center_frequency, signal_desc.bandwidth, - anchor_start, anchor_duration, anchor_center_freq, anchor_bw) - if iou_score > best_iou_score and boxes[time_cell, freq_cell, 0+5*anchor_idx] != 1: + iou_score = self.iou( + signal_desc.start, + signal_desc.duration, + signal_desc.center_frequency, + signal_desc.bandwidth, + anchor_start, + anchor_duration, + anchor_center_freq, + anchor_bw, + ) + if ( + iou_score > best_iou_score + and boxes[time_cell, freq_cell, 0 + 5 * anchor_idx] != 1 + ): # If IoU score is the best out of all anchors and anchor hasn't already been used for another burst, save results best_iou_score = iou_score best_iou_idx = anchor_idx @@ -620,65 +772,78 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc # -- loss & inference will require predicted_box_wh = (sigmoid(model_output_wh)*2)**2 * anchor_wh if best_iou_score > 0: # Detection: - boxes[time_cell, freq_cell, 0+5*best_iou_idx] = 1 + boxes[time_cell, freq_cell, 0 + 5 * best_iou_idx] = 1 # Center time & freq - boxes[time_cell, freq_cell, 1+5*best_iou_idx] = center_time - boxes[time_cell, freq_cell, 3+5*best_iou_idx] = center_freq + boxes[time_cell, freq_cell, 1 + 5 * best_iou_idx] = center_time + boxes[time_cell, freq_cell, 3 + 5 * best_iou_idx] = center_freq # Duration/Bandwidth (Width/Height) - boxes[time_cell, freq_cell, 2+5*best_iou_idx] = signal_desc.duration / best_anchor_duration - boxes[time_cell, freq_cell, 4+5*best_iou_idx] = signal_desc.bandwidth / best_anchor_bw - return boxes + boxes[time_cell, freq_cell, 2 + 5 * best_iou_idx] = ( + 
signal_desc.duration / best_anchor_duration + ) + boxes[time_cell, freq_cell, 4 + 5 * best_iou_idx] = ( + signal_desc.bandwidth / best_anchor_bw + ) + return boxes + - class DescPassThrough(Transform): """Transform to simply pass the SignalDescription through. Same as applying no transform in most cases. - + """ + def __init__(self): super(DescPassThrough, self).__init__() - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> Union[List[SignalDescription], SignalDescription]: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> Union[List[SignalDescription], SignalDescription]: return signal_description - - + + class DescToBinary(Transform): """Transform to transform SignalDescription into binary 0/1 label Args: label (:obj:`int`): Binary label to assign - + """ + def __init__(self, label: int): super(DescToBinary, self).__init__() self.label = label - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> int: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> int: return self.label - + class DescToCustom(Transform): """Transform to transform SignalDescription into any static value Args: label (:obj:`Any`): Custom static label to assign - + """ + def __init__(self, label: Any): super(DescToCustom, self).__init__() self.label = label - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> Any: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> Any: return self.label - - + + class DescToClassEncoding(Transform): """Transform to transform SignalDescription into one- or multi-hot class encodings. Note that either the number of classes or the full class list - must be provided as input. If neither are provided, the transform will - raise an error, and if both are provided, the transform will default to + must be provided as input. If neither are provided, the transform will + raise an error, and if both are provided, the transform will default to using the full class list. 
If only the number of classes are provided, the SignalDescription objects must contain the class index field @@ -688,28 +853,35 @@ class DescToClassEncoding(Transform): num_classes (:obj:`Optional[int]`): Number of classes in the encoding - + """ + def __init__( - self, + self, class_list: Optional[List[str]] = None, - num_classes: Optional[int] = None, + num_classes: Optional[int] = None, ) -> np.ndarray: super(DescToClassEncoding, self).__init__() self.class_list = class_list self.num_classes = num_classes if num_classes else len(class_list) - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) encoding = np.zeros((self.num_classes,)) for signal_desc in signal_description: if self.class_list: encoding[self.class_list.index(signal_desc.class_name)] = 1.0 else: - encoding[signal_desc.class_index] = 1.0 + encoding[signal_desc.class_index] = 1.0 return encoding - + class DescToWeightedMixUp(Transform): """Transform to transform SignalDescription into weighted multi-hot class @@ -718,19 +890,26 @@ class DescToWeightedMixUp(Transform): Args: class_list (:obj:`Optional[List[str]]`): Class list - + """ + def __init__( - self, + self, class_list: List[str] = None, ) -> np.ndarray: super(DescToWeightedMixUp, self).__init__() self.class_list = class_list self.num_classes = len(class_list) - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) encoding = np.zeros((self.num_classes,)) # Instead of a binary value for the encoding, set it to the SNR for signal_desc in signal_description: @@ -738,8 +917,8 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc # Next, normalize to the total of all SNR values encoding = encoding / np.sum(encoding) return encoding - - + + class DescToWeightedCutMix(Transform): """Transform to transform SignalDescription into weighted multi-hot class encodings. 
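As an aside, the encoding logic shared by DescToClassEncoding, DescToWeightedMixUp, and DescToWeightedCutMix reduces to the following standalone sketch; the class list and weights here are made-up examples rather than library defaults.

import numpy as np

class_list = ["bpsk", "qpsk", "8psk", "16qam"]  # hypothetical class list

# Plain multi-hot encoding: 1.0 for every class present in the example.
multi_hot = np.zeros((len(class_list),))
for name in ["qpsk", "16qam"]:
    multi_hot[class_list.index(name)] = 1.0
# multi_hot -> array([0., 1., 0., 1.])

# Weighted variant: accumulate a per-signal weight (SNR for the MixUp flavor,
# cumulative duration for the CutMix flavor) and normalize so the target sums to 1.
weights = {"bpsk": 10.0, "qpsk": 30.0}  # e.g. SNRs of two mixed signals
weighted = np.zeros((len(class_list),))
for name, w in weights.items():
    weighted[class_list.index(name)] += w
weighted = weighted / np.sum(weighted)
# weighted -> array([0.25, 0.75, 0.  , 0.  ])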
@@ -747,193 +926,236 @@ class DescToWeightedCutMix(Transform): Args: class_list (:obj:`Optional[List[str]]`): Class list - + """ + def __init__( - self, + self, class_list: List[str] = None, ) -> np.ndarray: super(DescToWeightedCutMix, self).__init__() self.class_list = class_list self.num_classes = len(class_list) - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) encoding = np.zeros((self.num_classes,)) # Instead of a binary value for the encoding, set it to the cumulative duration for signal_desc in signal_description: - encoding[self.class_list.index(signal_desc.class_name)] += signal_desc.duration + encoding[ + self.class_list.index(signal_desc.class_name) + ] += signal_desc.duration # Normalize on total signals durations encoding = encoding / np.sum(encoding) return encoding - - + + class DescToBBoxDict(Transform): """Transform to transform SignalDescriptions into the class bounding box format using dictionaries of labels and boxes, similar to the COCO image dataset - + Args: class_list (:obj:`list`): List of class names. Used when converting SignalDescription class names to indices """ + def __init__(self, class_list): super(DescToBBoxDict, self).__init__() self.class_list = class_list - - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) labels = [] - boxes = np.empty((len(signal_description),4)) + boxes = np.empty((len(signal_description), 4)) for signal_desc_idx, signal_desc in enumerate(signal_description): - #xcycwh + # xcycwh duration = signal_desc.stop - signal_desc.start bandwidth = signal_desc.upper_frequency - signal_desc.lower_frequency - boxes[signal_desc_idx] = np.array([ - signal_desc.start + 0.5*duration, - signal_desc.lower_frequency + 0.5 + 0.5*bandwidth, - duration, - bandwidth - ]) + boxes[signal_desc_idx] = np.array( + [ + signal_desc.start + 0.5 * duration, + signal_desc.lower_frequency + 0.5 + 0.5 * bandwidth, + duration, + bandwidth, + ] + ) labels.append(self.class_list.index(signal_desc.class_name)) - - targets = {"labels":torch.Tensor(labels).long(), "boxes":torch.Tensor(boxes)} + + targets = {"labels": torch.Tensor(labels).long(), "boxes": torch.Tensor(boxes)} return targets - + class DescToBBoxSignalDict(Transform): """Transform to transform SignalDescriptions into the class bounding box format - using dictionaries of labels and boxes, similar to the COCO image dataset. + using dictionaries of labels and boxes, similar to the COCO image dataset. Differs from the `SignalDescriptionToBoundingBoxDictTransform` in the ommission of signal-specific class labels, grouping all objects into the 'signal' class. 
""" + def __init__(self): super(DescToBBoxSignalDict, self).__init__() self.class_list = ["signal"] - - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) labels = [] - boxes = np.empty((len(signal_description),4)) + boxes = np.empty((len(signal_description), 4)) for signal_desc_idx, signal_desc in enumerate(signal_description): - #xcycwh + # xcycwh duration = signal_desc.stop - signal_desc.start bandwidth = signal_desc.upper_frequency - signal_desc.lower_frequency - boxes[signal_desc_idx] = np.array([ - signal_desc.start + 0.5*duration, - signal_desc.lower_frequency + 0.5 + 0.5*bandwidth, - duration, - bandwidth - ]) + boxes[signal_desc_idx] = np.array( + [ + signal_desc.start + 0.5 * duration, + signal_desc.lower_frequency + 0.5 + 0.5 * bandwidth, + duration, + bandwidth, + ] + ) labels.append(self.class_list.index(self.class_list[0])) - - targets = {"labels":torch.Tensor(labels).long(), "boxes":torch.Tensor(boxes)} + + targets = {"labels": torch.Tensor(labels).long(), "boxes": torch.Tensor(boxes)} return targets - - + + class DescToBBoxFamilyDict(Transform): """Transform to transform SignalDescriptions into the class bounding box format - using dictionaries of labels and boxes, similar to the COCO image dataset. + using dictionaries of labels and boxes, similar to the COCO image dataset. Differs from the `DescToBBoxDict` transform in the grouping of fine-grain classes into their signal family as defined by an input `class_family_dict` dictionary. 
- + Args: class_family_dict (:obj:`dict`): Dictionary mapping all class names to their families - + """ + class_family_dict = { - '4ask':'ask', - '8ask':'ask', - '16ask':'ask', - '32ask':'ask', - '64ask':'ask', - 'ook':'pam', - '4pam':'pam', - '8pam':'pam', - '16pam':'pam', - '32pam':'pam', - '64pam':'pam', - '2fsk':'fsk', - '2gfsk':'fsk', - '2msk':'fsk', - '2gmsk':'fsk', - '4fsk':'fsk', - '4gfsk':'fsk', - '4msk':'fsk', - '4gmsk':'fsk', - '8fsk':'fsk', - '8gfsk':'fsk', - '8msk':'fsk', - '8gmsk':'fsk', - '16fsk':'fsk', - '16gfsk':'fsk', - '16msk':'fsk', - '16gmsk':'fsk', - 'bpsk':'psk', - 'qpsk':'psk', - '8psk':'psk', - '16psk':'psk', - '32psk':'psk', - '64psk':'psk', - '16qam':'qam', - '32qam':'qam', - '32qam_cross':'qam', - '64qam':'qam', - '128qam_cross':'qam', - '256qam':'qam', - '512qam_cross':'qam', - '1024qam':'qam', - 'ofdm-64':'ofdm', - 'ofdm-72':'ofdm', - 'ofdm-128':'ofdm', - 'ofdm-180':'ofdm', - 'ofdm-256':'ofdm', - 'ofdm-300':'ofdm', - 'ofdm-512':'ofdm', - 'ofdm-600':'ofdm', - 'ofdm-900':'ofdm', - 'ofdm-1024':'ofdm', - 'ofdm-1200':'ofdm', - 'ofdm-2048':'ofdm', + "4ask": "ask", + "8ask": "ask", + "16ask": "ask", + "32ask": "ask", + "64ask": "ask", + "ook": "pam", + "4pam": "pam", + "8pam": "pam", + "16pam": "pam", + "32pam": "pam", + "64pam": "pam", + "2fsk": "fsk", + "2gfsk": "fsk", + "2msk": "fsk", + "2gmsk": "fsk", + "4fsk": "fsk", + "4gfsk": "fsk", + "4msk": "fsk", + "4gmsk": "fsk", + "8fsk": "fsk", + "8gfsk": "fsk", + "8msk": "fsk", + "8gmsk": "fsk", + "16fsk": "fsk", + "16gfsk": "fsk", + "16msk": "fsk", + "16gmsk": "fsk", + "bpsk": "psk", + "qpsk": "psk", + "8psk": "psk", + "16psk": "psk", + "32psk": "psk", + "64psk": "psk", + "16qam": "qam", + "32qam": "qam", + "32qam_cross": "qam", + "64qam": "qam", + "128qam_cross": "qam", + "256qam": "qam", + "512qam_cross": "qam", + "1024qam": "qam", + "ofdm-64": "ofdm", + "ofdm-72": "ofdm", + "ofdm-128": "ofdm", + "ofdm-180": "ofdm", + "ofdm-256": "ofdm", + "ofdm-300": "ofdm", + "ofdm-512": "ofdm", + "ofdm-600": "ofdm", + "ofdm-900": "ofdm", + "ofdm-1024": "ofdm", + "ofdm-1200": "ofdm", + "ofdm-2048": "ofdm", } + def __init__(self, class_family_dict: dict = None, family_list: list = None): super(DescToBBoxFamilyDict, self).__init__() - self.class_family_dict = class_family_dict if class_family_dict else self.class_family_dict - self.family_list = family_list if family_list else sorted(list(set(self.class_family_dict.values()))) - - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + self.class_family_dict = ( + class_family_dict if class_family_dict else self.class_family_dict + ) + self.family_list = ( + family_list + if family_list + else sorted(list(set(self.class_family_dict.values()))) + ) + + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) labels = [] - boxes = np.empty((len(signal_description),4)) + boxes = np.empty((len(signal_description), 4)) for signal_desc_idx, signal_desc in enumerate(signal_description): - #xcycwh + # xcycwh duration = signal_desc.stop - signal_desc.start bandwidth = signal_desc.upper_frequency - signal_desc.lower_frequency - boxes[signal_desc_idx] = np.array([ - signal_desc.start + 0.5*duration, - signal_desc.lower_frequency + 0.5 + 
0.5*bandwidth, - duration, - bandwidth - ]) + boxes[signal_desc_idx] = np.array( + [ + signal_desc.start + 0.5 * duration, + signal_desc.lower_frequency + 0.5 + 0.5 * bandwidth, + duration, + bandwidth, + ] + ) if isinstance(signal_desc.class_name, list): signal_desc.class_name = signal_desc.class_name[0] family_name = self.class_family_dict[signal_desc.class_name] labels.append(self.family_list.index(family_name)) - - targets = {"labels":torch.Tensor(labels).long(), "boxes":torch.Tensor(boxes)} + + targets = {"labels": torch.Tensor(labels).long(), "boxes": torch.Tensor(boxes)} return targets - - + + class DescToInstMaskDict(Transform): """Transform to transform SignalDescriptions into the class mask format using dictionaries of labels and masks, similar to the COCO image dataset - + Args: class_list (:obj:`list`): List of class names. Used when converting SignalDescription class names @@ -944,8 +1166,9 @@ class DescToInstMaskDict(Transform): Height of masks """ + def __init__( - self, + self, class_list: List = [], width: int = 512, height: int = 512, @@ -954,9 +1177,15 @@ def __init__( self.class_list = class_list self.width = width self.height = height - - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) num_objects = len(signal_description) labels = [] masks = np.zeros((num_objects, self.height, self.width)) @@ -966,27 +1195,41 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: signal_desc.upper_frequency = 0.5 - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ signal_desc_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ signal_desc_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 - - targets = {"labels":torch.Tensor(labels).long(), "masks":torch.Tensor(masks.astype(bool))} + + targets = { + "labels": torch.Tensor(labels).long(), + "masks": torch.Tensor(masks.astype(bool)), + } return targets - + class DescToSignalInstMaskDict(Transform): """Transform to transform SignalDescriptions into the class mask format using dictionaries of labels and masks, similar to the COCO image dataset - + Args: width (:obj:`int`): Width of masks @@ -994,17 +1237,24 @@ class 
DescToSignalInstMaskDict(Transform): Height of masks """ + def __init__( - self, + self, width: int = 512, height: int = 512, ): super(DescToSignalInstMaskDict, self).__init__() self.width = width self.height = height - - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) num_objects = len(signal_description) labels = [] masks = np.zeros((num_objects, self.height, self.width)) @@ -1014,30 +1264,44 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: signal_desc.upper_frequency = 0.5 - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ signal_desc_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ signal_desc_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 - - targets = {"labels":torch.Tensor(labels).long(), "masks":torch.Tensor(masks.astype(bool))} + + targets = { + "labels": torch.Tensor(labels).long(), + "masks": torch.Tensor(masks.astype(bool)), + } return targets - - + + class DescToSignalFamilyInstMaskDict(Transform): """Transform to transform SignalDescriptions into the class mask format using dictionaries of labels and masks, similar to the COCO image dataset. The labels with this target transform are set to be the class's family. If no `class_family_dict` is provided, the default mapping for the WBSig53 modulation families is used. 
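For reference, the row/column rasterization that the instance-mask transforms above repeat can be sketched in isolation. The helper below is illustrative only (its name and the 512x512 defaults are assumptions, not part of the patch); it mirrors the mapping of normalized start/stop times to spectrogram columns and normalized frequencies in [-0.5, 0.5] to rows, including the one-row minimum used for very narrowband signals:

import numpy as np

def rasterize_box(start, stop, lower_freq, upper_freq, width=512, height=512):
    # Hypothetical helper: paint one signal's time/frequency extent into a binary mask.
    # start/stop are normalized times in [0, 1]; lower_freq/upper_freq are
    # normalized frequencies in [-0.5, 0.5], matching the fields used above.
    mask = np.zeros((height, width))
    lower_freq = max(lower_freq, -0.5)
    upper_freq = min(upper_freq, 0.5)
    row_lo = int((lower_freq + 0.5) * height)
    row_hi = int((upper_freq + 0.5) * height)
    if row_lo == row_hi:
        row_hi += 1  # keep at least one row for very narrowband signals
    mask[row_lo:row_hi, int(start * width):int(stop * width)] = 1.0
    return mask

# Example: a signal spanning the middle 25% of time and 10% of bandwidth.
mask = rasterize_box(start=0.375, stop=0.625, lower_freq=-0.05, upper_freq=0.05)
print(int(mask.sum()))  # count of "on" pixels in the 512x512 mask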
- + Args: class_family_dict (:obj:`dict`): Dictionary mapping all class names to their families @@ -1049,76 +1313,90 @@ class DescToSignalFamilyInstMaskDict(Transform): Height of resultant spectrogram mask """ + class_family_dict = { - '4ask':'ask', - '8ask':'ask', - '16ask':'ask', - '32ask':'ask', - '64ask':'ask', - 'ook':'pam', - '4pam':'pam', - '8pam':'pam', - '16pam':'pam', - '32pam':'pam', - '64pam':'pam', - '2fsk':'fsk', - '2gfsk':'fsk', - '2msk':'fsk', - '2gmsk':'fsk', - '4fsk':'fsk', - '4gfsk':'fsk', - '4msk':'fsk', - '4gmsk':'fsk', - '8fsk':'fsk', - '8gfsk':'fsk', - '8msk':'fsk', - '8gmsk':'fsk', - '16fsk':'fsk', - '16gfsk':'fsk', - '16msk':'fsk', - '16gmsk':'fsk', - 'bpsk':'psk', - 'qpsk':'psk', - '8psk':'psk', - '16psk':'psk', - '32psk':'psk', - '64psk':'psk', - '16qam':'qam', - '32qam':'qam', - '32qam_cross':'qam', - '64qam':'qam', - '128qam_cross':'qam', - '256qam':'qam', - '512qam_cross':'qam', - '1024qam':'qam', - 'ofdm-64':'ofdm', - 'ofdm-72':'ofdm', - 'ofdm-128':'ofdm', - 'ofdm-180':'ofdm', - 'ofdm-256':'ofdm', - 'ofdm-300':'ofdm', - 'ofdm-512':'ofdm', - 'ofdm-600':'ofdm', - 'ofdm-900':'ofdm', - 'ofdm-1024':'ofdm', - 'ofdm-1200':'ofdm', - 'ofdm-2048':'ofdm', + "4ask": "ask", + "8ask": "ask", + "16ask": "ask", + "32ask": "ask", + "64ask": "ask", + "ook": "pam", + "4pam": "pam", + "8pam": "pam", + "16pam": "pam", + "32pam": "pam", + "64pam": "pam", + "2fsk": "fsk", + "2gfsk": "fsk", + "2msk": "fsk", + "2gmsk": "fsk", + "4fsk": "fsk", + "4gfsk": "fsk", + "4msk": "fsk", + "4gmsk": "fsk", + "8fsk": "fsk", + "8gfsk": "fsk", + "8msk": "fsk", + "8gmsk": "fsk", + "16fsk": "fsk", + "16gfsk": "fsk", + "16msk": "fsk", + "16gmsk": "fsk", + "bpsk": "psk", + "qpsk": "psk", + "8psk": "psk", + "16psk": "psk", + "32psk": "psk", + "64psk": "psk", + "16qam": "qam", + "32qam": "qam", + "32qam_cross": "qam", + "64qam": "qam", + "128qam_cross": "qam", + "256qam": "qam", + "512qam_cross": "qam", + "1024qam": "qam", + "ofdm-64": "ofdm", + "ofdm-72": "ofdm", + "ofdm-128": "ofdm", + "ofdm-180": "ofdm", + "ofdm-256": "ofdm", + "ofdm-300": "ofdm", + "ofdm-512": "ofdm", + "ofdm-600": "ofdm", + "ofdm-900": "ofdm", + "ofdm-1024": "ofdm", + "ofdm-1200": "ofdm", + "ofdm-2048": "ofdm", } + def __init__( - self, - width: int, + self, + width: int, height: int, - class_family_dict: dict = None, + class_family_dict: dict = None, family_list: list = None, ): super(DescToSignalFamilyInstMaskDict, self).__init__() - self.class_family_dict = class_family_dict if class_family_dict else self.class_family_dict - self.family_list = family_list if family_list else sorted(list(set(self.class_family_dict.values()))) + self.class_family_dict = ( + class_family_dict if class_family_dict else self.class_family_dict + ) + self.family_list = ( + family_list + if family_list + else sorted(list(set(self.class_family_dict.values()))) + ) self.width = width self.height = height - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> np.ndarray: - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> np.ndarray: + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) num_objects = len(signal_description) labels = [] masks = np.zeros((num_objects, self.height, self.width)) @@ -1130,23 +1408,37 @@ def __call__(self, signal_description: 
Union[List[SignalDescription], SignalDesc signal_desc.lower_frequency = -0.5 if signal_desc.upper_frequency > 0.5: signal_desc.upper_frequency = 0.5 - if int((signal_desc.lower_frequency+0.5) * self.height) == int((signal_desc.upper_frequency+0.5) * self.height): + if int((signal_desc.lower_frequency + 0.5) * self.height) == int( + (signal_desc.upper_frequency + 0.5) * self.height + ): masks[ signal_desc_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height)+1, - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ) + + 1, + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 else: masks[ signal_desc_idx, - int((signal_desc.lower_frequency+0.5) * self.height) : int((signal_desc.upper_frequency+0.5) * self.height), - int(signal_desc.start * self.width) : int(signal_desc.stop * self.width), + int((signal_desc.lower_frequency + 0.5) * self.height) : int( + (signal_desc.upper_frequency + 0.5) * self.height + ), + int(signal_desc.start * self.width) : int( + signal_desc.stop * self.width + ), ] = 1.0 - - targets = {"labels":torch.Tensor(labels).long(), "masks":torch.Tensor(masks.astype(bool))} + + targets = { + "labels": torch.Tensor(labels).long(), + "masks": torch.Tensor(masks.astype(bool)), + } return targets - + class DescToListTuple(Transform): """Transform to transform SignalDescription into a list of tuples containing the modulation, start time, stop time, center frequency, bandwidth, and SNR @@ -1155,16 +1447,23 @@ class DescToListTuple(Transform): Args: precision (:obj: `np.dtype`): Specify the data type precision for the tuple's information - + """ + def __init__(self, precision: np.dtype = np.dtype(np.float16)): super(DescToListTuple, self).__init__() self.precision = precision - def __call__(self, signal_description: Union[List[SignalDescription], SignalDescription]) -> Union[List[str], str]: + def __call__( + self, signal_description: Union[List[SignalDescription], SignalDescription] + ) -> Union[List[str], str]: output = [] # Handle cases of both SignalDescriptions and lists of SignalDescriptions - signal_description = [signal_description] if isinstance(signal_description, SignalDescription) else signal_description + signal_description = ( + [signal_description] + if isinstance(signal_description, SignalDescription) + else signal_description + ) # Loop through SignalDescription's, converting values of interest to tuples for signal_desc_idx, signal_desc in enumerate(signal_description): curr_tuple = ( @@ -1177,18 +1476,18 @@ def __call__(self, signal_description: Union[List[SignalDescription], SignalDesc ) output.append(curr_tuple) return output - - + + class ListTupleToDesc(Transform): """Transform to transform a list of tuples to a list of SignalDescriptions - Sample rate and number of IQ samples optional arguments are provided in + Sample rate and number of IQ samples optional arguments are provided in order to fill in additional information if desired. 
If a class list is provided, the class names are used with the list to fill in class indices - + Args: sample_rate (:obj: `Optional[float]`): Optionally provide the sample rate for the SignalDescriptions - + num_iq_samples (:obj: `Optional[int]`): Optionally provide the number of IQ samples for the SignalDescriptions @@ -1196,10 +1495,11 @@ class ListTupleToDesc(Transform): Optionally provide the class list to fill in class indices """ + def __init__( self, sample_rate: Optional[float] = 1.0, - num_iq_samples: Optional[int] = int(512*512), + num_iq_samples: Optional[int] = int(512 * 512), class_list: Optional[List] = None, ): super(ListTupleToDesc, self).__init__() @@ -1215,47 +1515,51 @@ def __call__(self, list_tuple: List[Tuple]) -> List[SignalDescription]: sample_rate=self.sample_rate, num_iq_samples=self.num_iq_samples, class_name=curr_tuple[0], - class_index=self.class_list.index(curr_tuple[0]) if self.class_list else None, + class_index=self.class_list.index(curr_tuple[0]) + if self.class_list + else None, start=curr_tuple[1], stop=curr_tuple[2], center_frequency=curr_tuple[3], bandwidth=curr_tuple[4], - lower_frequency=curr_tuple[3]-curr_tuple[4]/2, - upper_frequency=curr_tuple[3]+curr_tuple[4]/2, + lower_frequency=curr_tuple[3] - curr_tuple[4] / 2, + upper_frequency=curr_tuple[3] + curr_tuple[4] / 2, snr=curr_tuple[5], ) output.append(curr_signal_desc) return output - - + + class LabelSmoothing(Transform): - """Transform to transform a numpy array encoding to a smoothed version to + """Transform to transform a numpy array encoding to a smoothed version to assist with overconfidence. The input hyperparameter `alpha` determines the degree of smoothing with the following equation: - + output = (1 - alpha) / num_hot * input + alpha / num_classes, - + Where, output ~ Smoothed output encoding alpha ~ Degree of smoothing to apply num_hot ~ Number of positively-labeled classes input ~ Input one/multi-hot encoding num_classes ~ Number of classes - - Note that the `LabelSmoothing` transform accepts a numpy encoding input, - and as such, should be used in conjunction with a preceeding + + Note that the `LabelSmoothing` transform accepts a numpy encoding input, + and as such, should be used in conjunction with a preceeding DescTo... transform that maps the SignalDescription to the expected numpy encoding format. 
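As a worked example of the smoothing equation quoted above, the following standalone numpy sketch (the function name is illustrative and stands in for the transform class) applies alpha = 0.1 to a one-hot encoding over four classes:

import numpy as np

def smooth(encoding: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    # output = (1 - alpha) / num_hot * input + alpha / num_classes
    return (1 - alpha) / np.sum(encoding) * encoding + alpha / encoding.shape[0]

one_hot = np.array([0.0, 1.0, 0.0, 0.0])
print(smooth(one_hot))        # [0.025 0.925 0.025 0.025]
print(smooth(one_hot).sum())  # 1.0, so the total probability mass is preserved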
- + Args: alpha (:obj:`float`): Degree of smoothing to apply - + """ + def __init__(self, alpha: float = 0.1) -> np.ndarray: super(LabelSmoothing, self).__init__() self.alpha = alpha def __call__(self, encoding: np.ndarray) -> np.ndarray: - return (1 - self.alpha) / np.sum(encoding) * encoding + (self.alpha / encoding.shape[0]) - \ No newline at end of file + return (1 - self.alpha) / np.sum(encoding) * encoding + ( + self.alpha / encoding.shape[0] + ) diff --git a/torchsig/utils/dataset.py b/torchsig/utils/dataset.py index 9e223b0..d911037 100644 --- a/torchsig/utils/dataset.py +++ b/torchsig/utils/dataset.py @@ -7,20 +7,21 @@ class SignalDataset(torch.utils.data.Dataset): """An abstract dataset class to be sub-classed by SignalDatasets - + Args: - transform: + transform: Transforms to be applied to SignalData Objects - - target_transform: + + target_transform: Transforms to be applied to dataset targets - + """ + def __init__( self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, - seed: Optional[int] = None + seed: Optional[int] = None, ): super(SignalDataset, self).__init__() self.random_generator = np.random.RandomState(seed) @@ -39,30 +40,29 @@ class SignalFileDataset(SignalDataset): a set of files Args: - root: + root: Root file path to search recursively for files - - indexer: + + indexer: Using root, constructs an index of data/meta-data - - reader: + + reader: Given a file path, produces an SignalData object - - index_filter: + + index_filter: Given an index, remove certain elements - - **\*\*kwargs:** + + *\\*kwargs:** Keyword arguments - + """ + def __init__( self, root: str, indexer: Callable[[str], List[Tuple[Any, SignalCapture]]], reader: Callable[[SignalCapture], SignalData], - index_filter: Optional[ - Callable[[Tuple[Any, SignalCapture]], bool] - ] = None, + index_filter: Optional[Callable[[Tuple[Any, SignalCapture]], bool]] = None, **kwargs ): super(SignalFileDataset, self).__init__(**kwargs) @@ -89,21 +89,22 @@ def __len__(self) -> int: class SignalTensorDataset(torch.utils.data.TensorDataset): """SignalTensorDataset converts Tensors to dataset of SignalData - + Args: - transform: + transform: Transforms to be applied to SignalData Objects - - target_transform: + + target_transform: Transforms to be applied to dataset targets - - **\*args:** + + ***args:** Args - - **\*\*kwargs:** - \*tensors is passed on to the TensorDataset superclass - + + ***kwargs:** + *tensors is passed on to the TensorDataset superclass + """ + def __init__( self, transform: Optional[Callable] = None, @@ -116,12 +117,14 @@ def __init__( self.target_transform = target_transform def __getitem__(self, index: int) -> Tuple[SignalData, Any]: - # We assume that single-precision Tensors are provided we return + # We assume that single-precision Tensors are provided we return # double-precision numpy arrays for usage in the transform pipeline. 
signal_data = SignalData( data=deepcopy(self.tensors[0].numpy().tobytes()), item_type=np.dtype(np.float32), - data_type=np.dtype(np.float64) if self.tensors[0].dtype == torch.float else np.dtype(np.complex128) + data_type=np.dtype(np.float64) + if self.tensors[0].dtype == torch.float + else np.dtype(np.complex128), ) target = tuple(self.tensors[idx][index] for idx in range(1, len(self.tensors))) diff --git a/torchsig/utils/visualize.py b/torchsig/utils/visualize.py index da3c78c..bba3dde 100644 --- a/torchsig/utils/visualize.py +++ b/torchsig/utils/visualize.py @@ -1,9 +1,11 @@ import pywt import numpy as np +import torch from copy import deepcopy from scipy import ndimage from scipy import signal as sp from matplotlib import pyplot as plt +from matplotlib import patches from matplotlib.figure import Figure from torch.utils.data import dataloader from typing import Optional, Callable, Iterable, Union, Tuple, List @@ -15,19 +17,20 @@ class Visualizer: Args: data_loader: A Dataloader to sample from for plotting - + visualize_transform: Defines how to transform the data prior to plotting - + visualize_target_transform: Defines how to transform the target prior to plotting - + """ + def __init__( self, data_loader: dataloader, visualize_transform: Optional[Callable] = None, - visualize_target_transform: Optional[Callable] = None + visualize_target_transform: Optional[Callable] = None, ): self.data_loader = iter(data_loader) self.visualize_transform = visualize_transform @@ -57,27 +60,28 @@ class SpectrogramVisualizer(Visualizer): Args: sample_rate: The sample rate of the input data - + window: The window for use in the spectrogram - + nperseg: Specify the segments for the spectrogram - + noverlap: Specify the overlap for the spectrogram - + nfft: Specify the number of FFT bins for the spectrogram - - **\*\*kwargs:** + + ****kwargs:** Keyword arguments - + """ + def __init__( self, sample_rate: float = 1.0, - window: Optional[Union[str, Tuple, np.ndarray]] = sp.windows.tukey(256, .25), + window: Optional[Union[str, Tuple, np.ndarray]] = sp.windows.tukey(256, 0.25), nperseg: int = 256, noverlap: Optional[int] = None, nfft: Optional[int] = None, @@ -94,8 +98,11 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = iq_data.shape[0] figure = plt.figure() for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) _, _, spectrogram = sp.spectrogram( x=iq_data[sample_idx], fs=self.sample_rate, @@ -103,15 +110,15 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: nperseg=self.nperseg, noverlap=self.noverlap, nfft=self.nfft, - return_onesided=False + return_onesided=False, ) - spectrogram = 20 * np.log10(np.fft.fftshift(np.abs(spectrogram),axes=0)) + spectrogram = 20 * np.log10(np.fft.fftshift(np.abs(spectrogram), axes=0)) plt.imshow( spectrogram, vmin=np.min(spectrogram[spectrogram != -np.inf]), vmax=np.max(spectrogram[spectrogram != np.inf]), aspect="auto", - cmap="jet" + cmap="jet", ) plt.xticks([]) plt.yticks([]) @@ -125,20 +132,21 @@ class WaveletVisualizer(Visualizer): Args: wavelet: The wavelet to apply to the data prior to plotting - + nscales: Specify the number of wavelet scales - + sample_rate: The sample rate of the input data - - **\*\*kwargs:** + + ****kwargs:** Keyword arguments - + """ + def __init__( self, - wavelet: str = 'mexh', + wavelet: str = "mexh", 
nscales: int = 33, sample_rate: float = 1.0, **kwargs @@ -152,14 +160,17 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = iq_data.shape[0] figure = plt.figure() for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) scales = np.arange(1, self.nscales) cwt_matrix, freqs = pywt.cwt( iq_data[sample_idx], scales=scales, wavelet=self.wavelet, - sampling_period=1.0 / self.sample_rate + sampling_period=1.0 / self.sample_rate, ) ts = np.arange(len(cwt_matrix[0])) / self.sample_rate plt.imshow( @@ -168,7 +179,7 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: vmin=0, vmax=np.abs(cwt_matrix).max(), aspect="auto", - cmap="jet" # 'PRGn' + cmap="jet", # 'PRGn' ) plt.xticks([]) plt.yticks([]) @@ -180,10 +191,11 @@ class ConstellationVisualizer(Visualizer): """Visualize a constellation Args: - **\*\*kwargs:** + ****kwargs:** Keyword arguments - + """ + def __init__(self, **kwargs): super(ConstellationVisualizer, self).__init__(**kwargs) @@ -191,10 +203,12 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = iq_data.shape[0] figure = plt.figure() for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) - plt.scatter(np.real(iq_data[sample_idx]), - np.imag(iq_data[sample_idx])) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) + plt.scatter(np.real(iq_data[sample_idx]), np.imag(iq_data[sample_idx])) plt.xticks([]) plt.yticks([]) plt.title(str(targets[sample_idx])) @@ -205,10 +219,11 @@ class IQVisualizer(Visualizer): """Visualize time-series IQ data Args: - **\*\*kwargs:** + ****kwargs:** Keyword arguments - + """ + def __init__(self, **kwargs): super(IQVisualizer, self).__init__(**kwargs) @@ -216,8 +231,11 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = iq_data.shape[0] figure = plt.figure() for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) plt.plot(np.real(iq_data[sample_idx])) plt.plot(np.imag(iq_data[sample_idx])) plt.xticks([]) @@ -230,9 +248,10 @@ class TimeSeriesVisualizer(Visualizer): """Visualize time-series data directly Args: - **\*\*kwargs:** + **kwargs:** Keyword arguments """ + def __init__(self, **kwargs): super(TimeSeriesVisualizer, self).__init__(**kwargs) @@ -240,8 +259,11 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = data.shape[0] figure = plt.figure() for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) plt.plot(data[sample_idx]) plt.xticks([]) plt.yticks([]) @@ -253,10 +275,11 @@ class ImageVisualizer(Visualizer): """Visualize image data directly Args: - **\*\*kwargs:** + ****kwargs:** Keyword arguments - + """ + def __init__(self, **kwargs): super(ImageVisualizer, self).__init__(**kwargs) @@ -264,14 +287,17 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = data.shape[0] figure = plt.figure() for sample_idx in 
range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) plt.imshow( data[sample_idx], vmin=np.min(data[sample_idx][data[sample_idx] != -np.inf]), vmax=np.max(data[sample_idx][data[sample_idx] != np.inf]), aspect="auto", - cmap="jet" + cmap="jet", ) plt.xticks([]) plt.yticks([]) @@ -281,7 +307,7 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: class PSDVisualizer(Visualizer): - """ Visualize a PSD + """Visualize a PSD Args: fft_size: @@ -296,24 +322,28 @@ def _visualize(self, iq_data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = iq_data.shape[0] figure = plt.figure() for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) Pxx, freqs = plt.psd(iq_data[sample_idx], NFFT=self.fft_size, Fs=1) plt.xticks() plt.yticks() plt.title(str(targets[sample_idx])) return figure - - + + class MaskVisualizer(Visualizer): - """ Visualize data with mask label information overlaid - + """Visualize data with mask label information overlaid + Args: **kwargs: """ + def __init__(self, **kwargs): super(MaskVisualizer, self).__init__(**kwargs) - + def __next__(self) -> Figure: iq_data, targets = next(self.data_iter) if self.visualize_transform: @@ -325,13 +355,16 @@ def __next__(self) -> Figure: targets = None return self._visualize(iq_data, targets) - + def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = data.shape[0] figure = plt.figure(frameon=False) for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) extent = 0, data.shape[1], 0, data.shape[2] data_img = plt.imshow( data[sample_idx], @@ -354,22 +387,23 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: plt.xticks([]) plt.yticks([]) plt.title("Data") - + return figure - + class MaskClassVisualizer(Visualizer): - """ + """ Visualize data with mask label information overlaid and the class of the mask included in the title - + Args: **kwargs: """ + def __init__(self, class_list, **kwargs): super(MaskClassVisualizer, self).__init__(**kwargs) self.class_list = class_list - + def __next__(self) -> Figure: iq_data, targets = next(self.data_iter) if self.visualize_transform: @@ -381,13 +415,18 @@ def __next__(self) -> Figure: targets = None return self._visualize(iq_data, targets, classes) - - def _visualize(self, data: np.ndarray, targets: np.ndarray, classes: List) -> Figure: + + def _visualize( + self, data: np.ndarray, targets: np.ndarray, classes: List + ) -> Figure: batch_size = data.shape[0] figure = plt.figure(frameon=False) for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) extent = 0, data.shape[1], 0, data.shape[2] data_img = plt.imshow( data[sample_idx], @@ -415,22 +454,23 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray, classes: List) -> Fi plt.xticks([]) plt.yticks([]) plt.title(title) - + return figure - - + + class SemanticMaskClassVisualizer(Visualizer): - """ + """ 
Visualize data with mask label information overlaid and the class of the mask included in the title - + Args: **kwargs: """ + def __init__(self, class_list, **kwargs): super(SemanticMaskClassVisualizer, self).__init__(**kwargs) self.class_list = class_list - + def __next__(self) -> Figure: iq_data, targets = next(self.data_iter) if self.visualize_transform: @@ -440,13 +480,16 @@ def __next__(self) -> Figure: targets = self.visualize_target_transform(deepcopy(targets)) return self._visualize(iq_data, targets) - + def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = data.shape[0] figure = plt.figure(frameon=False) for sample_idx in range(batch_size): - plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) + plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) extent = 0, data.shape[1], 0, data.shape[2] data_img = plt.imshow( data[sample_idx], @@ -465,29 +508,32 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: extent=extent, ) classes_present = list(set(targets[sample_idx].flatten().tolist())) - classes_present.remove(0.0) # Remove 'background' class - title = [self.class_list[int(class_idx-1)] for class_idx in classes_present] + classes_present.remove(0.0) # Remove 'background' class + title = [ + self.class_list[int(class_idx - 1)] for class_idx in classes_present + ] else: title = "Data" plt.xticks([]) plt.yticks([]) plt.title(title) - - return figure - - + + return figure + + class BoundingBoxVisualizer(Visualizer): - """ Visualize data with bounding box label information overlaid - + """Visualize data with bounding box label information overlaid + Args: **kwargs: """ + def __init__(self, **kwargs): super(BoundingBoxVisualizer, self).__init__(**kwargs) - + def __next__(self) -> Figure: iq_data, targets = next(self.data_iter) - + if self.visualize_transform: iq_data = self.visualize_transform(deepcopy(iq_data)) @@ -497,17 +543,20 @@ def __next__(self) -> Figure: targets = targets return self._visualize(iq_data, targets) - + def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = data.shape[0] figure = plt.figure(frameon=False) for sample_idx in range(batch_size): - ax = plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) - + ax = plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) + # Retrieve individual label ax.imshow( - data[sample_idx], + data[sample_idx], vmin=np.min(data[sample_idx]), vmax=np.max(data[sample_idx]), cmap="jet", @@ -519,35 +568,65 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: for grid_cell_x_idx in range(label.shape[0]): for grid_cell_y_idx in range(label.shape[1]): if label[grid_cell_x_idx, grid_cell_y_idx, 0] == 1: - duration = label[grid_cell_x_idx, grid_cell_y_idx, 2]*data[sample_idx].shape[0] - bandwidth = label[grid_cell_x_idx, grid_cell_y_idx, 4]*data[sample_idx].shape[1] - start_pixel = (grid_cell_x_idx*pixels_per_cell_x) + (label[grid_cell_x_idx, grid_cell_y_idx, 1]*pixels_per_cell_x) - duration/2 - low_freq = (grid_cell_y_idx*pixels_per_cell_y) + (label[grid_cell_x_idx, grid_cell_y_idx, 3]*pixels_per_cell_y) \ - - (label[grid_cell_x_idx, grid_cell_y_idx, 4]/2 * data[sample_idx].shape[1]) + duration = ( + label[grid_cell_x_idx, grid_cell_y_idx, 2] + * data[sample_idx].shape[0] + ) + bandwidth = ( + label[grid_cell_x_idx, grid_cell_y_idx, 4] + * data[sample_idx].shape[1] + ) + 
start_pixel = ( + (grid_cell_x_idx * pixels_per_cell_x) + + ( + label[grid_cell_x_idx, grid_cell_y_idx, 1] + * pixels_per_cell_x + ) + - duration / 2 + ) + low_freq = ( + (grid_cell_y_idx * pixels_per_cell_y) + + ( + label[grid_cell_x_idx, grid_cell_y_idx, 3] + * pixels_per_cell_y + ) + - ( + label[grid_cell_x_idx, grid_cell_y_idx, 4] + / 2 + * data[sample_idx].shape[1] + ) + ) rect = patches.Rectangle( - (start_pixel,low_freq), + (start_pixel, low_freq), duration, - bandwidth, # Bandwidth (pixels) + bandwidth, # Bandwidth (pixels) linewidth=3, - edgecolor='b', - facecolor='none' + edgecolor="b", + facecolor="none", ) ax.add_patch(rect) - plt.imshow(data[sample_idx], aspect='auto', cmap="jet",vmin=np.min(data[sample_idx]),vmax=np.max(data[sample_idx])) + plt.imshow( + data[sample_idx], + aspect="auto", + cmap="jet", + vmin=np.min(data[sample_idx]), + vmax=np.max(data[sample_idx]), + ) plt.xticks([]) plt.yticks([]) plt.title("Data") - + return figure - - + + class AnchorBoxVisualizer(Visualizer): - """ Visualize data with anchor box label information overlaid - + """Visualize data with anchor box label information overlaid + Args: **kwargs: """ + def __init__( self, data_loader: dataloader, @@ -560,10 +639,10 @@ def __init__( self.visualize_target_transform = visualize_target_transform self.anchor_boxes = anchor_boxes self.num_anchor_boxes = len(anchor_boxes) - + def __next__(self) -> Figure: iq_data, targets = next(self.data_iter) - + if self.visualize_transform: iq_data = self.visualize_transform(deepcopy(iq_data)) @@ -573,17 +652,20 @@ def __next__(self) -> Figure: targets = targets return self._visualize(iq_data, targets) - + def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: batch_size = data.shape[0] figure = plt.figure(frameon=False) for sample_idx in range(batch_size): - ax = plt.subplot(int(np.ceil(np.sqrt(batch_size))), - int(np.sqrt(batch_size)), sample_idx + 1) - + ax = plt.subplot( + int(np.ceil(np.sqrt(batch_size))), + int(np.sqrt(batch_size)), + sample_idx + 1, + ) + # Retrieve individual label ax.imshow( - data[sample_idx], + data[sample_idx], vmin=np.min(data[sample_idx]), vmax=np.max(data[sample_idx]), cmap="jet", @@ -595,37 +677,89 @@ def _visualize(self, data: np.ndarray, targets: np.ndarray) -> Figure: for grid_cell_x_idx in range(label.shape[0]): for grid_cell_y_idx in range(label.shape[1]): for anchor_idx in range(self.num_anchor_boxes): - if label[grid_cell_x_idx, grid_cell_y_idx, 0+5*anchor_idx] == 1: - duration = label[grid_cell_x_idx, grid_cell_y_idx, 2+5*anchor_idx]*self.anchor_boxes[anchor_idx][0]*data[sample_idx].shape[0] - bandwidth = label[grid_cell_x_idx, grid_cell_y_idx, 4+5*anchor_idx]*self.anchor_boxes[anchor_idx][1]*data[sample_idx].shape[1] - start_pixel = (grid_cell_x_idx*pixels_per_cell_x) + (label[grid_cell_x_idx, grid_cell_y_idx, 1+5*anchor_idx]*pixels_per_cell_x) - duration/2 - low_freq = (grid_cell_y_idx*pixels_per_cell_y) + (label[grid_cell_x_idx, grid_cell_y_idx, 3+5*anchor_idx]*pixels_per_cell_y) \ - - (label[grid_cell_x_idx, grid_cell_y_idx, 4+5*anchor_idx]*self.anchor_boxes[anchor_idx][1]/2 * data[sample_idx].shape[1]) + if ( + label[grid_cell_x_idx, grid_cell_y_idx, 0 + 5 * anchor_idx] + == 1 + ): + duration = ( + label[ + grid_cell_x_idx, grid_cell_y_idx, 2 + 5 * anchor_idx + ] + * self.anchor_boxes[anchor_idx][0] + * data[sample_idx].shape[0] + ) + bandwidth = ( + label[ + grid_cell_x_idx, grid_cell_y_idx, 4 + 5 * anchor_idx + ] + * self.anchor_boxes[anchor_idx][1] + * data[sample_idx].shape[1] + ) + start_pixel = ( + 
(grid_cell_x_idx * pixels_per_cell_x) + + ( + label[ + grid_cell_x_idx, + grid_cell_y_idx, + 1 + 5 * anchor_idx, + ] + * pixels_per_cell_x + ) + - duration / 2 + ) + low_freq = ( + (grid_cell_y_idx * pixels_per_cell_y) + + ( + label[ + grid_cell_x_idx, + grid_cell_y_idx, + 3 + 5 * anchor_idx, + ] + * pixels_per_cell_y + ) + - ( + label[ + grid_cell_x_idx, + grid_cell_y_idx, + 4 + 5 * anchor_idx, + ] + * self.anchor_boxes[anchor_idx][1] + / 2 + * data[sample_idx].shape[1] + ) + ) rect = patches.Rectangle( - (start_pixel,low_freq), + (start_pixel, low_freq), duration, - bandwidth, # Bandwidth (pixels) + bandwidth, # Bandwidth (pixels) linewidth=3, - edgecolor='b', - facecolor='none' + edgecolor="b", + facecolor="none", ) ax.add_patch(rect) - - plt.imshow(data[sample_idx], aspect='auto', cmap="jet",vmin=np.min(data[sample_idx]),vmax=np.max(data[sample_idx])) + + plt.imshow( + data[sample_idx], + aspect="auto", + cmap="jet", + vmin=np.min(data[sample_idx]), + vmax=np.max(data[sample_idx]), + ) plt.xticks([]) plt.yticks([]) plt.title("Data") - + return figure - - + + ############################################################################### # Visualizer Transform Functions ############################################################################### + def two_channel_to_complex(tensor: np.ndarray) -> np.ndarray: - """Visualizer data transform: Transform two channel IQ data to complex IQ + """Visualizer data transform: Transform two channel IQ data to complex IQ data for visualization """ @@ -644,27 +778,29 @@ def complex_spectrogram_to_magnitude(tensor: np.ndarray) -> np.ndarray: """ batch_size = tensor.shape[0] new_tensor = np.zeros( - (batch_size, tensor.shape[2], tensor.shape[3]), dtype=np.float64) + (batch_size, tensor.shape[2], tensor.shape[3]), dtype=np.float64 + ) for idx in range(tensor.shape[0]): - new_tensor[idx] = 20 * np.log10(tensor[idx, 0]**2 + tensor[idx, 1]**2) + new_tensor[idx] = 20 * np.log10(tensor[idx, 0] ** 2 + tensor[idx, 1] ** 2) return new_tensor def magnitude_spectrogram(tensor: np.ndarray) -> np.ndarray: - """Visualizer data transform: Transform magnitude spectrogram data for + """Visualizer data transform: Transform magnitude spectrogram data for plotting (mode = 'psd') """ batch_size = tensor.shape[0] new_tensor = np.zeros( - (batch_size, tensor.shape[1], tensor.shape[2]), dtype=np.float64) + (batch_size, tensor.shape[1], tensor.shape[2]), dtype=np.float64 + ) for idx in range(tensor.shape[0]): new_tensor[idx] = 20 * np.log10(tensor[idx]) return new_tensor def iq_to_complex_magnitude(tensor: np.ndarray) -> np.ndarray: - """Visualizer data transform: Complex IQ to time series magnitude for + """Visualizer data transform: Complex IQ to time series magnitude for TimeSeriesVisualizer """ @@ -676,7 +812,7 @@ def iq_to_complex_magnitude(tensor: np.ndarray) -> np.ndarray: def binary_label_format(tensor: np.ndarray) -> List[str]: - """Visualizer target transform: Format binary labels for titles in + """Visualizer target transform: Format binary labels for titles in visualizer """ @@ -688,7 +824,7 @@ def binary_label_format(tensor: np.ndarray) -> List[str]: def onehot_label_format(tensor: np.ndarray) -> List[str]: - """Visualizer target transform: Format onehot labels for titles in + """Visualizer target transform: Format onehot labels for titles in visualizer """ @@ -700,35 +836,35 @@ def onehot_label_format(tensor: np.ndarray) -> List[str]: def multihot_label_format(tensor: np.ndarray, class_list: List[str]) -> List[str]: - """Target Transform: Format multihot labels 
for titles in visualizer - - """ + """Target Transform: Format multihot labels for titles in visualizer""" batch_size = tensor.shape[0] label = [] for idx in range(batch_size): curr_label = [] for class_idx in range(len(class_list)): - if tensor[idx][class_idx] > (1/len(class_list)): + if tensor[idx][class_idx] > (1 / len(class_list)): curr_label.append(class_list[class_idx]) label.append(curr_label) return label def mask_to_outline(tensor: np.ndarray) -> List[str]: - """Target Transform: Transforms masks for all bursts to outlines for the - MaskVisualizer. Overlapping mask outlines are represented as a single + """Target Transform: Transforms masks for all bursts to outlines for the + MaskVisualizer. Overlapping mask outlines are represented as a single polygon. - + """ batch_size = tensor.shape[0] labels = [] - struct = ndimage.generate_binary_structure(2,2) + struct = ndimage.generate_binary_structure(2, 2) for idx in range(batch_size): label = tensor[idx].numpy() label = np.sum(label, axis=0) - label[label>0] = 1 + label[label > 0] = 1 label = label - ndimage.binary_erosion(label) - label = ndimage.binary_dilation(label, structure=struct, iterations=3).astype(label.dtype) + label = ndimage.binary_dilation(label, structure=struct, iterations=3).astype( + label.dtype + ) label = np.ma.masked_where(label == 0, label) labels.append(label) return labels @@ -736,21 +872,24 @@ def mask_to_outline(tensor: np.ndarray) -> List[str]: def mask_to_outline_overlap(tensor: np.ndarray) -> List[str]: """Target Transform: Transforms masks for each burst to individual outlines - for the MaskVisualizer. Overlapping mask outlines are still shown as + for the MaskVisualizer. Overlapping mask outlines are still shown as overlapping. - + """ batch_size = tensor.shape[0] labels = [] - struct = ndimage.generate_binary_structure(2,2) + struct = ndimage.generate_binary_structure(2, 2) for idx in range(batch_size): label = tensor[idx].numpy() for individual_burst_idx in range(label.shape[0]): - label[individual_burst_idx] = label[individual_burst_idx] - \ - ndimage.binary_erosion(label[individual_burst_idx]) + label[individual_burst_idx] = label[ + individual_burst_idx + ] - ndimage.binary_erosion(label[individual_burst_idx]) label = np.sum(label, axis=0) - label[label>0] = 1 - label = ndimage.binary_dilation(label, structure=struct, iterations=2).astype(label.dtype) + label[label > 0] = 1 + label = ndimage.binary_dilation(label, structure=struct, iterations=2).astype( + label.dtype + ) label = np.ma.masked_where(label == 0, label) labels.append(label) return labels @@ -759,13 +898,13 @@ def mask_to_outline_overlap(tensor: np.ndarray) -> List[str]: def overlay_mask(tensor: np.ndarray) -> List[str]: """Target Transform: Transforms multi-dimensional mask to binary overlay of full mask. - + """ batch_size = tensor.shape[0] labels = [] for idx in range(batch_size): label = torch.sum(tensor[idx], axis=0).numpy() - label[label>0] = 1 + label[label > 0] = 1 label = np.ma.masked_where(label == 0, label) labels.append(label) return labels @@ -775,23 +914,26 @@ def mask_class_to_outline(tensor: np.ndarray) -> List[str]: """Target Transform: Transforms masks for each burst to individual outlines for the MaskClassVisualizer. Overlapping mask outlines are still shown as overlapping. Each bursts' class index is also returned. 
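The outline-style target transforms above all rely on the same morphology idea: binarize the mask, subtract its erosion to keep only the perimeter, then dilate the perimeter so it stays visible when overlaid. A minimal single-mask sketch (the function name is illustrative; the library versions operate on batched torch tensors) is:

import numpy as np
from scipy import ndimage

def outline(mask: np.ndarray, iterations: int = 3) -> np.ndarray:
    # Turn a filled binary mask into a thickened outline for overlay plotting.
    filled = (mask > 0).astype(float)
    border = filled - ndimage.binary_erosion(filled)   # keep only the perimeter
    struct = ndimage.generate_binary_structure(2, 2)   # 8-connected structuring element
    border = ndimage.binary_dilation(border, structure=struct, iterations=iterations)
    return np.ma.masked_where(border == 0, border.astype(float))

demo = np.zeros((64, 64))
demo[16:48, 8:56] = 1.0
print(outline(demo).count())  # number of unmasked (outline) pixels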
- + """ batch_size = tensor.shape[0] labels = [] class_idx = [] - struct = ndimage.generate_binary_structure(2,2) + struct = ndimage.generate_binary_structure(2, 2) for idx in range(batch_size): label = tensor[idx].numpy() class_idx_curr = [] for individual_burst_idx in range(label.shape[0]): if np.count_nonzero(label[individual_burst_idx]) > 0: class_idx_curr.append(individual_burst_idx) - label[individual_burst_idx] = label[individual_burst_idx] - \ - ndimage.binary_erosion(label[individual_burst_idx]) + label[individual_burst_idx] = label[ + individual_burst_idx + ] - ndimage.binary_erosion(label[individual_burst_idx]) label = np.sum(label, axis=0) - label[label>0] = 1 - label = ndimage.binary_dilation(label, structure=struct, iterations=2).astype(label.dtype) + label[label > 0] = 1 + label = ndimage.binary_dilation(label, structure=struct, iterations=2).astype( + label.dtype + ) label = np.ma.masked_where(label == 0, label) class_idx.append(class_idx_curr) labels.append(label) diff --git a/torchsig/version.py b/torchsig/version.py deleted file mode 100644 index 541f859..0000000 --- a/torchsig/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.1.0' \ No newline at end of file
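To make the tuple layout used by DescToListTuple and ListTupleToDesc concrete, a small round-trip sketch follows. It assumes the classes are importable from the module path implied by the file shown in the diff (torchsig.transforms.target_transforms.target_transforms) and uses made-up numbers; the tuple order is (modulation, start, stop, center frequency, bandwidth, SNR) as documented above:

import numpy as np
from torchsig.transforms.target_transforms.target_transforms import (
    DescToListTuple,
    ListTupleToDesc,
)

# One annotation in the documented tuple order.
tuples = [("bpsk", 0.10, 0.60, 0.00, 0.20, 10.0)]

to_desc = ListTupleToDesc(
    sample_rate=1.0,
    num_iq_samples=int(512 * 512),
    class_list=["bpsk", "qpsk"],
)
descs = to_desc(tuples)
print(descs[0].class_index)      # 0, looked up from class_list
print(descs[0].lower_frequency)  # -0.1, i.e. center_frequency - bandwidth / 2

# Back to tuples (precision defaults to float16 in DescToListTuple).
to_tuple = DescToListTuple()
print(to_tuple(descs))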