feat: add target to resize transform for aspect ratio training (detection task) #823

Merged (17 commits) on Mar 9, 2022
26 changes: 24 additions & 2 deletions doctr/transforms/modules/pytorch.py
@@ -4,8 +4,9 @@
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 import math
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union
 
+import numpy as np
 import torch
 from PIL.Image import Image
 from torch.nn.functional import pad
@@ -27,7 +28,12 @@ def __init__(
         self.preserve_aspect_ratio = preserve_aspect_ratio
         self.symmetric_pad = symmetric_pad
 
-    def forward(self, img: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self,
+        img: torch.Tensor,
+        target: Optional[np.ndarray] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, np.ndarray]]:
Contributor:
Thinking about it, I think we should have:

  • one function to resize the image
  • one function to resize the target
  • use them in the module implementation

I'm not sure what would be best, but having Resize (image only) and SampleResize (image + target) would probably be relevant. What do you think?

Collaborator Author (@charlesmindee), Feb 25, 2022:

Yes, we can split this into one function for the targets and one for the image, but I am not sure having both a Resize and a SampleResize is relevant: if you want to keep the aspect ratio of the image while resizing, you will almost always pad (except if you give only one target size), and padding modifies the targets. The Resize transform would then not be able to preserve the aspect ratio of the images, and we would have to use different transforms in the training scripts and in the preprocessor depending on the option selected by the user (preserve_aspect_ratio or not). It could even be dangerous if someone modified the aspect ratio in Resize without changing the targets, I think. Do you agree?

Collaborator Author (@charlesmindee), Feb 25, 2022:

As for separating the target and image functions: I think it would complicate the code, because we use many attributes of the Resize class to resize both the image and the target (self.symmetric_pad, self.preserve_aspect_ratio, self.output_size), and these would all have to be passed as arguments. We also have many computations shared between the image and the target that would need to be done twice, or else added to the function signatures, which I think would make the code less understandable: offset, raw_shape, and img.shape is also used in the target computation. What do you think @fg-mindee?

Contributor:

About splitting the two transforms: the reason behind this is that the target, depending on the task, will not necessarily be modified. So either we add a lot of cases (one per target type) to the Resize transform, or we have a Resize that only transforms the image and a SampleResize that inherits from it and also transforms the target.

I agree it would be dangerous for someone to modify this without changing the target, but transforms will only be played with by people willing to train models. So I would argue it's safe to assume they either use our default training script or have a good understanding of what is going on under the hood 🤷‍♂️

However, I fully agree about the symmetric pad / computation duplication for some aspects. I don't have an ideal suggestion for this.

Collaborator Author:

For the SampleResize transform, you want it to inherit from Resize, but since they won't have the same signature, I don't really see why we would do that? @fg-mindee

Contributor:

We can always refactor this later on anyway! Feel free to implement it the way you prefer (the optional target passing you suggested is probably the best one)
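For reference, a minimal sketch of the inheritance-based design discussed in this thread (this SampleResize is hypothetical and is not what was merged; it assumes the image-only Resize above, relative box coordinates, and only handles the aspect-ratio-preserving case):

```python
from typing import Optional, Tuple

import numpy as np
import torch

from doctr.transforms import Resize  # the image-only transform discussed above


class SampleResize(Resize):
    """Hypothetical: resize an (image, target) pair instead of the image alone."""

    def forward(  # type: ignore[override]
        self, img: torch.Tensor, target: Optional[np.ndarray] = None
    ) -> Tuple[torch.Tensor, Optional[np.ndarray]]:
        in_shape = img.shape[-2:]
        img = super().forward(img)  # image-only resize + optional padding
        if target is not None and self.preserve_aspect_ratio:
            # Re-derive the pre-padding shape that Resize already computed
            # internally -- the computation duplication raised above.
            scale = min(self.size[0] / in_shape[0], self.size[1] / in_shape[1])
            raw_shape = (int(in_shape[0] * scale), int(in_shape[1] * scale))
            target = target.copy()
            # Works for both (N, 4) boxes and (N, 4, 2) polygons in relative coords
            target[..., 0::2] *= raw_shape[1] / self.size[1]  # x coordinates
            target[..., 1::2] *= raw_shape[0] / self.size[0]  # y coordinates
        return img, target
```

Note how the subclass has to re-derive raw_shape, which is exactly the shared-computation concern raised above, and part of why the optional-target approach was kept.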


         target_ratio = self.size[0] / self.size[1]
         actual_ratio = img.shape[-2] / img.shape[-1]
         if not self.preserve_aspect_ratio or (target_ratio == actual_ratio):
@@ -41,11 +47,27 @@ def forward(self, img: torch.Tensor) -> torch.Tensor:
 
             # Scale image
             img = F.resize(img, tmp_size, self.interpolation)
+            raw_shape = img.shape[-2:]
             # Pad (inverted in pytorch)
             _pad = (0, self.size[1] - img.shape[-1], 0, self.size[0] - img.shape[-2])
             if self.symmetric_pad:
                 half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2))
                 _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1])

+            # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
+            if target is not None:
+                if self.preserve_aspect_ratio:
+                    # Get absolute coords
+                    if target.shape[1:] == (4,):
+                        target[:, [0, 2]] *= raw_shape[-1] / self.size[1]
+                        target[:, [1, 3]] *= raw_shape[-2] / self.size[0]
+                    elif target.shape[1:] == (4, 2):
+                        target[..., 0] *= raw_shape[-1] / self.size[1]
+                        target[..., 1] *= raw_shape[-2] / self.size[0]
Contributor:

If the aspect ratio is preserved, there might be some shifting (padding) to apply to the target as well, I'd argue.

I might be missing something, but it looks like the padding step is missing.

Collaborator Author:

I checked with show-samples, it should be OK.
What is done here is just a change of reference frame: we multiply the coordinates of each point by the ratio old_shape / new_shape, the old shape being the shape before padding (for instance, if you pad half of the picture, you want to divide all x and y coordinates by 2).
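To make that reference-frame change concrete, a small made-up example: a 100×200 (H×W) image resized into a 100×100 frame with the aspect ratio preserved yields 50×100 content, padded at the bottom when padding is not symmetric.

```python
import numpy as np

raw_shape = (50, 100)     # content shape after scaling, before padding (H, W)
output_size = (100, 100)  # final padded frame (H, W)

# One relative box spanning the whole original image
box = np.array([[0.0, 0.0, 1.0, 1.0]], dtype=np.float32)  # xmin, ymin, xmax, ymax

# Re-express it relative to the padded frame, as in the diff above
box[:, [0, 2]] *= raw_shape[1] / output_size[1]  # x ratio: 100/100, unchanged
box[:, [1, 3]] *= raw_shape[0] / output_size[0]  # y ratio: 50/100, halved
print(box)  # [[0.  0.  1.  0.5]] -> the box now covers only the top half
```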

Collaborator Author:

ratio

Contributor:

But if preserving the aspect ratio effectively performs scaling then padding, doesn't the target need to be transformed with an affine function, not just a scaling one? Or am I missing something that is already being performed?

Either way, it looks like show-samples isn't yielding the desired outputs (perhaps because of another transfo): for the left and right samples at least, I can easily see how to preserve the aspect ratio while removing part of the padding. Also, why is the content no longer centered? 🤔

Contributor:

I perfectly agree with you about getting the shape & target before padding. But in the next lines the image is padded while the target remains unchanged, and that's the part I'm concerned about (again, I might be missing something) 🙃

Collaborator Author:

For the second point, I think I should indeed add the boxes to the transformation

Contributor:

Oh yeah, I see, now it makes sense!
So I assume that on the first sample it was padded on the bottom or on the right, but the rotation made it look like the padding comes from the top right?

Either way, I would argue the first and last samples have an issue: they are padded on 2 sides (we could have expanded the content more while preserving the aspect ratio). Is that because of the double resize? If so, we should find a better way, and I may have a suggestion. Say we want to resize to (W, H):

  • the first resize should bring the minimum side of the image to max(W, H) (just scaling, so that the shortest side equals max(W, H)), without padding anything
  • we then do a rotation with expand=True
  • then we do a final resize, this time preserving the aspect ratio (thus padding)

What do you think?

Also: should we try to make it centered though?
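A rough sketch of that three-step suggestion (illustrative wiring only, not the merged code; it assumes torchvision's shortest-side int-size Resize for step 1, plus the doctr modules used in the training scripts below):

```python
import doctr.transforms as T
from torchvision.transforms import Resize as TvResize

size = 1024  # say the target frame is (W, H) = (1024, 1024)

rotation_pipeline = T.SampleCompose([
    # 1. scale so the shortest side reaches max(W, H), without any padding
    T.ImageTransform(TvResize(size)),
    # 2. rotate while expanding the canvas, so no content is cropped
    T.RandomRotate(90, expand=True),
    # 3. final resize into the target frame, padding to keep the aspect ratio
    T.ImageTransform(T.Resize((size, size), preserve_aspect_ratio=True)),
])
```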

Collaborator Author:

yes

Collaborator Author:

This should be OK now!
Aspect ratio and no symmetric padding: [screenshot "nosym"]
Aspect ratio and symmetric padding: [screenshot "sym"]
No aspect ratio: [screenshot "noratio"]

Collaborator Author:

I tested all the options in both TF & PT; by default I enabled aspect ratio preservation & symmetric padding in the scripts.

+                    else:
+                        raise AssertionError
+                return pad(img, _pad), target
 
         return pad(img, _pad)

     def __repr__(self) -> str:
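With the new signature, the transform can be called on an image alone or on an (image, target) pair; a minimal usage sketch (shapes and values are made up, boxes are relative (xmin, ymin, xmax, ymax) coordinates):

```python
import numpy as np
import torch

from doctr import transforms as T

img = torch.rand(3, 64, 128)  # C x H x W
boxes = np.array([[0.1, 0.2, 0.4, 0.6], [0.5, 0.3, 0.9, 0.8]], dtype=np.float32)

resize = T.Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
out_img, out_boxes = resize(img, boxes)  # boxes re-expressed in the padded frame
assert out_img.shape == (3, 32, 32)
```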
25 changes: 23 additions & 2 deletions doctr/transforms/modules/tensorflow.py
@@ -4,7 +4,7 @@
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 import random
-from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import tensorflow as tf
@@ -75,9 +75,15 @@ def extra_repr(self) -> str:
         _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}"
         return _repr
 
-    def __call__(self, img: tf.Tensor) -> tf.Tensor:
+    def __call__(
+        self,
+        img: tf.Tensor,
+        target: Optional[np.ndarray] = None,
+    ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]:
+
         input_dtype = img.dtype
         img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio)
+        raw_shape = img.shape[:2]
         if self.preserve_aspect_ratio:
             # pad width
             if not self.symmetric_pad:
@@ -87,6 +93,21 @@
             else:
                 offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
             img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
 
+        # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
+        if target is not None:
+            if self.preserve_aspect_ratio:
+                # Get absolute coords
+                if target.shape[1:] == (4,):
+                    target[:, [0, 2]] *= raw_shape[1] / self.output_size[1]
+                    target[:, [1, 3]] *= raw_shape[0] / self.output_size[0]
+                elif target.shape[1:] == (4, 2):
+                    target[..., 0] *= raw_shape[1] / self.output_size[1]
+                    target[..., 1] *= raw_shape[0] / self.output_size[0]
+                else:
+                    raise AssertionError
+            return tf.cast(img, dtype=input_dtype), target
 
         return tf.cast(img, dtype=input_dtype)


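The TensorFlow version mirrors this; a sketch with (N, 4, 2) polygon targets this time (made-up values, HWC image tensor, relative coordinates):

```python
import numpy as np
import tensorflow as tf

from doctr import transforms as T

img = tf.random.uniform((64, 128, 3), 0, 1, dtype=tf.float32)  # H x W x C
polys = np.array(
    [[[0.1, 0.2], [0.6, 0.2], [0.6, 0.7], [0.1, 0.7]]], dtype=np.float32
)

resize = T.Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
out_img, out_polys = resize(img, polys)  # (4, 2) targets are rescaled as well
```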
15 changes: 8 additions & 7 deletions references/detection/train_pytorch.py
@@ -181,7 +181,7 @@ def main(args):
     val_set = DetectionDataset(
         img_folder=os.path.join(args.val_path, 'images'),
         label_path=os.path.join(args.val_path, 'labels.json'),
-        img_transforms=T.Resize((args.input_size, args.input_size)),
+        img_transforms=T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True),
         use_polygons=args.rotation and not args.eval_straight,
     )
     val_loader = DataLoader(
@@ -243,17 +243,18 @@ def main(args):
         img_folder=os.path.join(args.train_path, 'images'),
         label_path=os.path.join(args.train_path, 'labels.json'),
         img_transforms=Compose(
-            ([T.Resize((args.input_size, args.input_size))] if not args.rotation else [])
-            + [
+            [
                 # Augmentations
                 T.RandomApply(T.ColorInversion(), .1),
                 ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.02),
             ]
         ),
-        sample_transforms=T.SampleCompose([
-            T.RandomRotate(90, expand=True),
-            T.ImageTransform(T.Resize((args.input_size, args.input_size))),
-        ]) if args.rotation else None,
+        sample_transforms=T.SampleCompose(
+            [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True)]
+            + ([T.RandomRotate(90, expand=True),
+                T.ImageTransform(T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True))
+                ] if args.rotation else [])
+        ),
         use_polygons=args.rotation,
     )

15 changes: 8 additions & 7 deletions references/detection/train_tensorflow.py
@@ -141,7 +141,7 @@ def main(args):
     val_set = DetectionDataset(
         img_folder=os.path.join(args.val_path, 'images'),
         label_path=os.path.join(args.val_path, 'labels.json'),
-        img_transforms=T.Resize((args.input_size, args.input_size)),
+        img_transforms=T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True),
         use_polygons=args.rotation and not args.eval_straight,
     )
     val_loader = DataLoader(
@@ -189,8 +189,7 @@ def main(args):
         img_folder=os.path.join(args.train_path, 'images'),
         label_path=os.path.join(args.train_path, 'labels.json'),
         img_transforms=T.Compose(
-            ([T.Resize((args.input_size, args.input_size))] if not args.rotation else [])
-            + [
+            [
                 # Augmentations
                 T.RandomApply(T.ColorInversion(), .1),
                 T.RandomJpegQuality(60),
@@ -199,10 +198,12 @@ def main(args):
                 T.RandomBrightness(.3),
             ]
         ),
-        sample_transforms=T.SampleCompose([
-            T.RandomRotate(90, expand=True),
-            T.ImageTransform(T.Resize((args.input_size, args.input_size))),
-        ]) if args.rotation else None,
+        sample_transforms=T.SampleCompose(
+            [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True)]
+            + ([T.RandomRotate(90, expand=True),
+                T.ImageTransform(T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True))
+                ] if args.rotation else [])
+        ),
         use_polygons=args.rotation,
     )
     train_loader = DataLoader(
2 changes: 1 addition & 1 deletion tests/tensorflow/test_models_detection_tf.py
@@ -66,7 +66,7 @@ def test_detection_models(arch_name, input_shape, output_size, out_prob):
         np.array([[.75, .75, .5, .5, 0], [.65, .7, .3, .4, 0]], dtype=np.float32),
     ]
     loss = model(input_tensor, target, training=True)['loss']
-    assert isinstance(loss, tf.Tensor) and ((loss - out['loss']) / loss).numpy() < 21e-2
+    assert isinstance(loss, tf.Tensor) and ((loss - out['loss']) / loss).numpy() < 25e-2


 @pytest.fixture(scope="session")