🐞fix support for non-square images #204

Merged · 4 commits · Apr 12, 2022
23 changes: 23 additions & 0 deletions anomalib/data/utils/image.py
@@ -14,11 +14,14 @@
# See the License for the specific language governing permissions
# and limitations under the License.

import math
from pathlib import Path
from typing import List, Union

import cv2
import numpy as np
import torch.nn.functional as F
from torch import Tensor
from torchvision.datasets.folder import IMG_EXTENSIONS


@@ -66,3 +69,23 @@ def read_image(path: Union[str, Path]) -> np.ndarray:
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

return image


def pad_nextpow2(batch: Tensor) -> Tensor:
"""Compute required padding from input size and return padded images.

Finds the largest spatial dimension and pads the batch to a square whose side is the next power of two.
If the padding required along a dimension is odd, the extra pixel is added to one side.

Args:
batch (Tensor): Input images

Returns:
Tensor: Padded batch
"""
# round the largest spatial dimension up to the next power of two
l_dim = 2 ** math.ceil(math.log(max(*batch.shape[-2:]), 2))
# split the required padding per dimension as evenly as possible,
# putting the extra pixel on one side when the difference is odd
padding_h = [math.ceil((l_dim - batch.shape[-2]) / 2), math.floor((l_dim - batch.shape[-2]) / 2)]
padding_w = [math.ceil((l_dim - batch.shape[-1]) / 2), math.floor((l_dim - batch.shape[-1]) / 2)]
# F.pad expects pads for the last dimension first: (left, right, top, bottom)
padded_batch = F.pad(batch, pad=[*padding_w, *padding_h])
return padded_batch
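A minimal usage sketch of the new helper (the shapes below are illustrative, not taken from the PR):

```python
import torch

from anomalib.data.utils.image import pad_nextpow2

batch = torch.rand(8, 3, 100, 150)  # non-square batch: height 100, width 150
padded = pad_nextpow2(batch)

# The largest dimension is 150, so the target side is 2 ** ceil(log2(150)) = 256;
# both dimensions are zero-padded as evenly as possible up to 256 x 256.
print(padded.shape)  # torch.Size([8, 3, 256, 256])
```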
9 changes: 6 additions & 3 deletions anomalib/models/ganomaly/model.py
@@ -24,6 +24,7 @@
from pytorch_lightning.callbacks import EarlyStopping
from torch import Tensor, optim

from anomalib.data.utils.image import pad_nextpow2
from anomalib.models.components import AnomalyModule

from .torch_model import GanomalyModel
@@ -38,8 +39,9 @@ class GanomalyLightning(AnomalyModule):

def __init__(self, hparams: Union[DictConfig, ListConfig]):
super().__init__(hparams)

self.model: GanomalyModel = GanomalyModel(
input_size=hparams.model.input_size[0],
input_size=hparams.model.input_size,
num_input_channels=3,
n_features=hparams.model.n_features,
latent_vec_size=hparams.model.latent_vec_size,
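For context on the dropped `[0]` index: anomalib configs store `model.input_size` as a (height, width) pair, and the whole pair now reaches `GanomalyModel`. A minimal sketch with assumed values:

```python
from omegaconf import OmegaConf

# Assumed config values for illustration.
hparams = OmegaConf.create({"model": {"input_size": [100, 150]}})

# Before: hparams.model.input_size[0] -> 100, which forced square sizing.
# After: the full (height, width) pair is forwarded to the model.
input_size = tuple(hparams.model.input_size)
print(input_size)  # (100, 150)
```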
@@ -99,18 +101,19 @@ def training_step(self, batch, _, optimizer_idx): # pylint: disable=arguments-differ
Dict[str, Tensor]: Loss
"""
images = batch["image"]
padded_images = pad_nextpow2(images)
loss: Dict[str, Tensor]

# Discriminator
if optimizer_idx == 0:
# forward pass
loss_discriminator = self.model.get_discriminator_loss(images)
loss_discriminator = self.model.get_discriminator_loss(padded_images)
loss = {"loss": loss_discriminator}

# Generator
else:
# forward pass
loss_generator = self.model.get_generator_loss(images)
loss_generator = self.model.get_generator_loss(padded_images)

loss = {"loss": loss_generator}

87 changes: 60 additions & 27 deletions anomalib/models/ganomaly/torch_model.py
@@ -12,16 +12,19 @@


import math
from typing import Tuple

import torch
from torch import Tensor, nn

from anomalib.data.utils.image import pad_nextpow2


class Encoder(nn.Module):
"""Encoder Network.

Args:
input_size (int): Size of input image
input_size (Tuple[int, int]): Size of input image
latent_vec_size (int): Size of latent vector z
num_input_channels (int): Number of input channels in the image
n_features (int): Number of features per convolution layer
@@ -31,7 +34,7 @@ class Encoder(nn.Module):

def __init__(
self,
input_size: int,
input_size: Tuple[int, int],
latent_vec_size: int,
num_input_channels: int,
n_features: int,
@@ -40,13 +43,10 @@ def __init__(
):
super().__init__()

assert input_size % 16 == 0, "Input size should be a multiple of 16"

self.input_layers = nn.Sequential()

self.input_layers.add_module(
f"initial-conv-{num_input_channels}-{n_features}",
nn.Conv2d(num_input_channels, n_features, kernel_size=4, stride=2, padding=1, bias=False),
nn.Conv2d(num_input_channels, n_features, kernel_size=4, stride=2, padding=4, bias=False),
)
self.input_layers.add_module(f"initial-relu-{n_features}", nn.LeakyReLU(0.2, inplace=True))

@@ -63,8 +63,8 @@ def __init__(

# Create pyramid features to reach latent vector
self.pyramid_features = nn.Sequential()
input_size = input_size // 2
while input_size > 4:
pyramid_dim = min(*input_size) // 2 # Use the smaller dimension to create the pyramid.
while pyramid_dim > 4:
in_features = n_features
out_features = n_features * 2
self.pyramid_features.add_module(
@@ -74,12 +74,17 @@ def __init__(
self.pyramid_features.add_module(f"pyramid-{out_features}-batchnorm", nn.BatchNorm2d(out_features))
self.pyramid_features.add_module(f"pyramid-{out_features}-relu", nn.LeakyReLU(0.2, inplace=True))
n_features = out_features
input_size = input_size // 2
pyramid_dim = pyramid_dim // 2

# Final conv
if add_final_conv_layer:
self.final_conv_layer = nn.Conv2d(
n_features, latent_vec_size, kernel_size=4, stride=1, padding=0, bias=False
n_features,
latent_vec_size,
kernel_size=4,
stride=1,
padding=0,
bias=False,
)

def forward(self, input_tensor: Tensor):
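As a worked example of the new pyramid sizing (assumed values, not part of the diff): for a padded 256x256 input with `n_features=64`, `pyramid_dim` starts at 128 and the loop emits five downsampling blocks, doubling the channel count each time:

```python
input_size = (256, 256)  # assumed post-padding size, e.g. from pad_nextpow2
n_features = 64          # assumed base feature count

pyramid_dim = min(*input_size) // 2  # 128, as in the encoder above
blocks = []
while pyramid_dim > 4:
    blocks.append((n_features, n_features * 2))  # (in_features, out_features)
    n_features *= 2
    pyramid_dim //= 2

print(blocks)  # [(64, 128), (128, 256), (256, 512), (512, 1024), (1024, 2048)]
```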
@@ -98,7 +103,7 @@ class Decoder(nn.Module):
"""Decoder Network.

Args:
input_size (int): Size of input image
input_size (Tuple[int, int]): Size of input image
latent_vec_size (int): Size of latent vector z
num_input_channels (int): Number of input channels in the image
n_features (int): Number of features per convolution layer
@@ -107,39 +112,57 @@
"""

def __init__(
self, input_size: int, latent_vec_size: int, num_input_channels: int, n_features: int, extra_layers: int = 0
self,
input_size: Tuple[int, int],
latent_vec_size: int,
num_input_channels: int,
n_features: int,
extra_layers: int = 0,
):
super().__init__()
assert input_size % 16 == 0, "Input size should be a multiple of 16"

self.latent_input = nn.Sequential()

# Calculate input channel size to recreate inverse pyramid
exp_factor = int(math.log(input_size // 4, 2)) - 1
exp_factor = math.ceil(math.log(min(input_size) // 2, 2)) - 2
n_input_features = n_features * (2**exp_factor)

# CNN layer for latent vector input
self.latent_input.add_module(
f"initial-{latent_vec_size}-{n_input_features}-convt",
nn.ConvTranspose2d(latent_vec_size, n_input_features, kernel_size=4, stride=1, padding=0, bias=False),
nn.ConvTranspose2d(
latent_vec_size,
n_input_features,
kernel_size=4,
stride=1,
padding=0,
bias=False,
),
)
self.latent_input.add_module(f"initial-{n_input_features}-batchnorm", nn.BatchNorm2d(n_input_features))
self.latent_input.add_module(f"initial-{n_input_features}-relu", nn.ReLU(True))

# Create inverse pyramid
self.inverse_pyramid = nn.Sequential()
input_size = input_size // 2
while input_size > 4:
pyramid_dim = min(*input_size) // 2 # Use the smaller dimension to create the pyramid.
while pyramid_dim > 4:
in_features = n_input_features
out_features = n_input_features // 2
self.inverse_pyramid.add_module(
f"pyramid-{in_features}-{out_features}-convt",
nn.ConvTranspose2d(in_features, out_features, kernel_size=4, stride=2, padding=1, bias=False),
nn.ConvTranspose2d(
in_features,
out_features,
kernel_size=4,
stride=2,
padding=1,
bias=False,
),
)
self.inverse_pyramid.add_module(f"pyramid-{out_features}-batchnorm", nn.BatchNorm2d(out_features))
self.inverse_pyramid.add_module(f"pyramid-{out_features}-relu", nn.ReLU(True))
n_input_features = out_features
input_size = input_size // 2
pyramid_dim = pyramid_dim // 2

# Extra Layers
self.extra_layers = nn.Sequential()
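The new `exp_factor` can be sanity-checked against the encoder: the decoder has to start from the channel count the encoder pyramid ends with. A standalone computation, using the same assumed values as the encoder sketch:

```python
import math

input_size = (256, 256)  # assumed post-padding size
n_features = 64          # assumed base feature count

exp_factor = math.ceil(math.log(min(input_size) // 2, 2)) - 2  # ceil(log2(128)) - 2 = 5
n_input_features = n_features * (2**exp_factor)

# 2048 matches the encoder pyramid's final out_features for the same sizes.
print(exp_factor, n_input_features)  # 5 2048
```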
@@ -159,7 +182,14 @@ def __init__(
self.final_layers = nn.Sequential()
self.final_layers.add_module(
f"final-{n_input_features}-{num_input_channels}-convt",
nn.ConvTranspose2d(n_input_features, num_input_channels, kernel_size=4, stride=2, padding=1, bias=False),
nn.ConvTranspose2d(
n_input_features,
num_input_channels,
kernel_size=4,
stride=2,
padding=1,
bias=False,
),
)
self.final_layers.add_module(f"final-{num_input_channels}-tanh", nn.Tanh())

@@ -178,13 +208,13 @@ class Discriminator(nn.Module):
Made of a single encoder which takes x and x_hat and produces a score.

Args:
input_size (int): Input image size.
input_size (Tuple[int, int]): Input image size.
num_input_channels (int): Number of image channels.
n_features (int): Number of feature maps in each convolution layer.
extra_layers (int, optional): Add extra intermediate layers. Defaults to 0.
"""

def __init__(self, input_size: int, num_input_channels: int, n_features: int, extra_layers: int = 0):
def __init__(self, input_size: Tuple[int, int], num_input_channels: int, n_features: int, extra_layers: int = 0):
super().__init__()
encoder = Encoder(input_size, 1, num_input_channels, n_features, extra_layers)
layers = []
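A hedged usage sketch of the discriminator with the tuple-valued `input_size` (constructor values are assumptions; the two-element return matches the `pred_fake, _ = self.discriminator(fake)` unpacking later in this file):

```python
import torch

discriminator = Discriminator(input_size=(256, 256), num_input_channels=3, n_features=64)

# Two-element return; the second element is assumed to be the
# intermediate feature map used by the feature-matching loss.
pred, _features = discriminator(torch.rand(2, 3, 256, 256))
print(pred.shape)  # one realness score per input image
```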
@@ -212,7 +242,7 @@ class Generator(nn.Module):
Made of an encoder-decoder-encoder architecture.

Args:
input_size (int): Size of input data.
input_size (Tuple[int, int]): Size of input data.
latent_vec_size (int): Dimension of latent vector produced between the first encoder-decoder.
num_input_channels (int): Number of channels in input image.
n_features (int): Number of feature maps in each convolution layer.
@@ -222,7 +252,7 @@ def __init__(

def __init__(
self,
input_size: int,
input_size: Tuple[int, int],
latent_vec_size: int,
num_input_channels: int,
n_features: int,
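A hedged sketch of the generator's encoder-decoder-encoder round trip with the tuple `input_size` (values assumed; the trailing constructor arguments are assumed to keep their defaults):

```python
import torch

generator = Generator(
    input_size=(256, 256),  # already padded, e.g. by pad_nextpow2
    latent_vec_size=100,
    num_input_channels=3,
    n_features=64,
)

# Returns the reconstruction plus the latent codes from both encoders,
# matching the `_, latent_i, latent_o` unpacking in GanomalyModel.forward below.
fake, latent_i, latent_o = generator(torch.rand(2, 3, 256, 256))
print(fake.shape, latent_i.shape, latent_o.shape)
```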
@@ -250,7 +280,7 @@ class GanomalyModel(nn.Module):
"""Ganomaly Model.

Args:
input_size (int): Input dimension of a square tensor.
input_size (Tuple[int, int]): Input dimensions.
num_input_channels (int): Number of input channels.
n_features (int): Number of features layers in the CNNs.
latent_vec_size (int): Size of autoencoder latent vector.
Expand All @@ -263,7 +293,7 @@ class GanomalyModel(nn.Module):

def __init__(
self,
input_size: int,
input_size: Tuple[int, int],
num_input_channels: int,
n_features: int,
latent_vec_size: int,
@@ -348,7 +378,9 @@ def get_generator_loss(self, images: Tensor) -> Tensor:
pred_fake, _ = self.discriminator(fake)

error_enc = self.loss_enc(latent_i, latent_o)

error_con = self.loss_con(images, fake)

error_adv = self.loss_adv(pred_real, pred_fake)

loss_generator = error_adv * self.wadv + error_con * self.wcon + error_enc * self.wenc
@@ -363,6 +395,7 @@ def forward(self, batch: Tensor) -> Tensor:
Returns:
Tensor: Regeneration scores.
"""
padded_batch = pad_nextpow2(batch)
self.generator.eval()
_, latent_i, latent_o = self.generator(batch)
_, latent_i, latent_o = self.generator(padded_batch)
return torch.mean(torch.pow((latent_i - latent_o), 2), dim=1).view(-1) # convert nx1x1 to n
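Finally, an end-to-end sketch of inference on a non-square batch, which now works because `forward` pads internally (constructor values are assumptions; the remaining arguments are assumed to keep their defaults):

```python
import torch

model = GanomalyModel(
    input_size=(100, 150),  # raw, non-square input size
    num_input_channels=3,
    n_features=64,
    latent_vec_size=100,
)
model.eval()

batch = torch.rand(8, 3, 100, 150)
scores = model(batch)  # padded to 256x256 internally before scoring
print(scores.shape)    # a flattened vector of regeneration scores
```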