[Refactor] Change the meaning of momentum in EMA (#1581)

* fix ema momentum meaning, results in codes and config changes
* fix comments
* Update mmedit/engine/hooks/ema.py
  Co-authored-by: Yanhong Zeng <zengyh1900@gmail.com>
* complete warning for old user
* fix lint
* fix ut
* test warning
* pytest capture warning
* fix lint

Co-authored-by: Yanhong Zeng <zengyh1900@gmail.com>
open-mmlab · Jan 18, 2023 · 6ed0f25 · 6ed0f25
1 parent 328c875
commit 6ed0f25
Show file tree

Hide file tree

Showing 35 changed files with 104 additions and 47 deletions.
diff --git a/configs/biggan/biggan-deep_cvt-hugging-face-rgb_imagenet1k-128x128.py b/configs/biggan/biggan-deep_cvt-hugging-face-rgb_imagenet1k-128x128.py
@@ -6,7 +6,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.9999,
+    momentum=0.0001,
     update_buffers=True,
     start_iter=20000)
 

diff --git a/configs/biggan/biggan-deep_cvt-hugging-face_rgb_imagenet1k-256x256.py b/configs/biggan/biggan-deep_cvt-hugging-face_rgb_imagenet1k-256x256.py
@@ -11,7 +11,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.9999,
+    momentum=0.0001,
     update_buffers=True,
     start_iter=20000)
 

diff --git a/configs/biggan/biggan-deep_cvt-hugging-face_rgb_imagenet1k-512x512.py b/configs/biggan/biggan-deep_cvt-hugging-face_rgb_imagenet1k-512x512.py
@@ -11,7 +11,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.9999,
+    momentum=0.0001,
     update_buffers=True,
     start_iter=20000)
 

diff --git a/configs/biggan/biggan_2xb25-500kiters_cifar10-32x32.py b/configs/biggan/biggan_2xb25-500kiters_cifar10-32x32.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.9999,
+    momentum=0.0001,
     start_iter=1000)
 
 model = dict(

diff --git a/configs/biggan/biggan_ajbrock-sn_8xb32-1500kiters_imagenet1k-128x128.py b/configs/biggan/biggan_ajbrock-sn_8xb32-1500kiters_imagenet1k-128x128.py
@@ -8,7 +8,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.9999,
+    momentum=0.0001,
     update_buffers=True,
     start_iter=20000)
 

diff --git a/configs/biggan/biggan_cvt-BigGAN-PyTorch-rgb_imagenet1k-128x128.py b/configs/biggan/biggan_cvt-BigGAN-PyTorch-rgb_imagenet1k-128x128.py
@@ -6,7 +6,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.9999,
+    momentum=0.0001,
     update_buffers=True,
     start_iter=20000)
 

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-c_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-c_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-d_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-d_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-e_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-e_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/.../positional_encoding_in_gans/mspie-stylegan2-config-f_c1_8xb2-1600kiters_ffhq-256-1024.py b/.../positional_encoding_in_gans/mspie-stylegan2-config-f_c1_8xb2-1600kiters_ffhq-256-1024.py
@@ -29,7 +29,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-f_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-f_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-f_c2_8xb3-1100kiters_ffhq-256-896.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-f_c2_8xb3-1100kiters_ffhq-256-896.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-g_c1_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-g_c1_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-h_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-h_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-i_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-i_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-j_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-j_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/...s/positional_encoding_in_gans/mspie-stylegan2-config-k_c2_8xb3-1100kiters_ffhq-256-512.py b/...s/positional_encoding_in_gans/mspie-stylegan2-config-k_c2_8xb3-1100kiters_ffhq-256-512.py
@@ -7,7 +7,7 @@
 ema_config = dict(
     type='ExponentialMovingAverage',
     interval=1,
-    momentum=0.5**(32. / (ema_half_life * 1000.)))
+    momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 
 model = dict(
     type='MSPIEStyleGAN2',

diff --git a/configs/positional_encoding_in_gans/stylegan2_c2_8xb3-1100kiters_ffhq-256x256.py b/configs/positional_encoding_in_gans/stylegan2_c2_8xb3-1100kiters_ffhq-256x256.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))))
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))))
 
 optim_wrapper = dict(
     generator=dict(

diff --git a/configs/positional_encoding_in_gans/stylegan2_c2_8xb3-1100kiters_ffhq-512x512.py b/configs/positional_encoding_in_gans/stylegan2_c2_8xb3-1100kiters_ffhq-512x512.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))))
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))))
 
 optim_wrapper = dict(
     generator=dict(

diff --git a/configs/real_basicvsr/realbasicvsr_wogan-c64b20-2x30x8_8xb2-lr1e-4-300k_reds.py b/configs/real_basicvsr/realbasicvsr_wogan-c64b20-2x30x8_8xb2-lr1e-4-300k_reds.py
@@ -300,7 +300,7 @@
         type='ExponentialMovingAverageHook',
         module_keys=('generator_ema'),
         interval=1,
-        interp_cfg=dict(momentum=0.999),
+        interp_cfg=dict(momentum=0.001),
     )
 ]
 

diff --git a/configs/real_esrgan/realesrnet_c64b23g32_4xb12-lr2e-4-1000k_df2k-ost.py b/configs/real_esrgan/realesrnet_c64b23g32_4xb12-lr2e-4-1000k_df2k-ost.py
@@ -258,6 +258,6 @@
         type='ExponentialMovingAverageHook',
         module_keys=('generator_ema'),
         interval=1,
-        interp_cfg=dict(momentum=0.999),
+        interp_cfg=dict(momentum=0.001),
     )
 ]
diff --git a/configs/styleganv1/styleganv1_ffhq-1024x1024_8xb4-25Mimgs.py b/configs/styleganv1/styleganv1_ffhq-1024x1024_8xb4-25Mimgs.py
@@ -7,7 +7,8 @@
 # MODEL
 model_wrapper_cfg = dict(find_unused_parameters=True)
 ema_half_life = 10.  # G_smoothing_kimg
-ema_config = dict(interval=1, momentum=0.5**(32. / (ema_half_life * 1000.)))
+ema_config = dict(
+    interval=1, momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 model = dict(
     generator=dict(out_size=1024),
     discriminator=dict(in_size=1024),

diff --git a/configs/styleganv1/styleganv1_ffhq-256x256_8xb4-25Mimgs.py b/configs/styleganv1/styleganv1_ffhq-256x256_8xb4-25Mimgs.py
@@ -7,7 +7,8 @@
 # MODEL
 model_wrapper_cfg = dict(find_unused_parameters=True)
 ema_half_life = 10.  # G_smoothing_kimg
-ema_config = dict(interval=1, momentum=0.5**(32. / (ema_half_life * 1000.)))
+ema_config = dict(
+    interval=1, momentum=1. - (0.5**(32. / (ema_half_life * 1000.))))
 model = dict(
     generator=dict(out_size=256),
     discriminator=dict(in_size=256),

diff --git a/configs/styleganv2/stylegan2_c2_8xb4-800kiters_ffhq-256x256.py b/configs/styleganv2/stylegan2_c2_8xb4-800kiters_ffhq-256x256.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))),
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))),
     loss_config=dict(
         r1_loss_weight=10. / 2. * d_reg_interval,
         r1_interval=d_reg_interval,

diff --git a/configs/styleganv2/stylegan2_c2_8xb4-800kiters_lsun-cat-256x256.py b/configs/styleganv2/stylegan2_c2_8xb4-800kiters_lsun-cat-256x256.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))),
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))),
     loss_config=dict(
         r1_loss_weight=10. / 2. * d_reg_interval,
         r1_interval=d_reg_interval,

diff --git a/configs/styleganv2/stylegan2_c2_8xb4-800kiters_lsun-church-256x256.py b/configs/styleganv2/stylegan2_c2_8xb4-800kiters_lsun-church-256x256.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))),
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))),
     loss_config=dict(
         r1_loss_weight=10. / 2. * d_reg_interval,
         r1_interval=d_reg_interval,

diff --git a/configs/styleganv2/stylegan2_c2_8xb4-800kiters_lsun-horse-256x256.py b/configs/styleganv2/stylegan2_c2_8xb4-800kiters_lsun-horse-256x256.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))),
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))),
     loss_config=dict(
         r1_loss_weight=10. / 2. * d_reg_interval,
         r1_interval=d_reg_interval,

diff --git a/configs/styleganv2/stylegan2_c2_8xb4_ffhq-1024x1024.py b/configs/styleganv2/stylegan2_c2_8xb4_ffhq-1024x1024.py
@@ -20,7 +20,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))),
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))),
     loss_config=dict(
         r1_loss_weight=10. / 2. * d_reg_interval,
         r1_interval=d_reg_interval,

diff --git a/configs/styleganv2/stylegan2_c2_8xb4_lsun-car-384x512.py b/configs/styleganv2/stylegan2_c2_8xb4_lsun-car-384x512.py
@@ -18,7 +18,7 @@
     ema_config=dict(
         type='ExponentialMovingAverage',
         interval=1,
-        momentum=0.5**(32. / (ema_half_life * 1000.))),
+        momentum=1. - (0.5**(32. / (ema_half_life * 1000.)))),
     loss_config=dict(
         r1_loss_weight=10. / 2. * d_reg_interval,
         r1_interval=d_reg_interval,

diff --git a/mmedit/engine/hooks/ema.py b/mmedit/engine/hooks/ema.py
@@ -69,22 +69,36 @@ def __init__(self,
             getattr(self, interp_mode), **self.interp_cfg)
 
     @staticmethod
-    def lerp(a, b, momentum=0.999, momentum_nontrainable=0., trainable=True):
-        """This is the function to perform linear interpolation between a and
-        b.
+    def lerp(a, b, momentum=0.001, momentum_nontrainable=1., trainable=True):
+        """Does a linear interpolation of two parameters/ buffers.
 
         Args:
-            a (float): number a
-            b (float): bumber b
-            momentum (float, optional): momentum. Defaults to 0.999.
-            momentum_nontrainable (float, optional): Defaults to 0.
-            trainable (bool, optional): trainable flag. Defaults to True.
-
+            a (torch.Tensor): Original (source) state, weighted by the
+                selected momentum in the result.
+            b (torch.Tensor): EMA state, weighted by one minus the selected
+                momentum in the result.
+            momentum (float, optional): The weight for the interpolation
+                formula. Defaults to 0.001.
+            momentum_nontrainable (float, optional): The weight for the
+                interpolation formula used for nontrainable parameters.
+                Defaults to 1.0.
+            trainable (bool, optional): Whether the input is trainable.
+                If set to False, momentum_nontrainable will be used.
+                Defaults to True.
         Returns:
-            _type_: _description_
+            torch.Tensor: Interpolation result.
         """
+        assert 0.0 < momentum < 1.0, 'momentum must be in range (0.0, 1.0)'\
+                                     f'but got {momentum}'
+        assert 0.0 < momentum_nontrainable <= 1.0, (
+            'momentum_nontrainable must be in range (0.0, 1.0] but got '
+            f'{momentum_nontrainable}')
+        if momentum > 0.5:
+            warnings.warn(
+                'The value of momentum in EMA is usually a small number,'
+                'which is different from the conventional notion of '
+                f'momentum but got {momentum}. Please make sure the '
+                f'value is correct.')
         m = momentum if trainable else momentum_nontrainable
-        return a + (b - a) * m
+        return b + (a - b) * m
 
     def every_n_iters(self, runner: Runner, n: int):
         """This is the function to perform every n iterations.

diff --git a/mmedit/models/base_models/average_model.py b/mmedit/models/base_models/average_model.py
@@ -49,6 +49,12 @@ def __init__(self,
         super().__init__(model, interval, device, update_buffers)
         assert 0.0 < momentum < 1.0, 'momentum must be in range (0.0, 1.0)'\
                                      f'but got {momentum}'
+        if momentum > 0.5:
+            warnings.warn(
+                'The value of momentum in EMA is usually a small number,'
+                'which is different from the conventional notion of '
+                f'momentum but got {momentum}. Please make sure the '
+                f'value is correct.')
         self.momentum = momentum
 
     def avg_func(self, averaged_param: Tensor, source_param: Tensor,
@@ -230,8 +236,11 @@ def avg_func(self, averaged_param: Tensor, source_param: Tensor,
             steps (int): The number of times the parameters have been
                 updated.
         """
-        momentum = self.rampup(self.steps, self.ema_kimg, self.ema_rampup,
-                               self.batch_size, self.eps)
+        momentum = 1. - self.rampup(self.steps, self.ema_kimg, self.ema_rampup,
+                                    self.batch_size, self.eps)
+        if not (0.0 < momentum < 1.0):
+            warnings.warn('RampUp momentum must be in range (0.0, 1.0)'
+                          f'but got {momentum}')
         averaged_param.mul_(1 - momentum).add_(source_param, alpha=momentum)
 
     def _load_from_state_dict(self, state_dict: dict, prefix: str,

diff --git a/tests/test_engine/test_hooks/test_ema.py b/tests/test_engine/test_hooks/test_ema.py
@@ -156,6 +156,19 @@ def test_ema_hook(self):
         assert torch.equal(runner.model.module_a.a, torch.tensor([0.25, 0.5]))
         assert torch.equal(ema_states['a'], torch.tensor([0.375, 0.75]))
 
+        # test warning
+        with pytest.warns(UserWarning):
+            default_config = dict(
+                module_keys=('module_a_ema', 'module_b_ema'),
+                interval=1,
+                interp_cfg=dict(momentum=0.6))
+            cfg_ = deepcopy(default_config)
+            ema = ExponentialMovingAverageHook(**cfg_)
+            ema.lerp(
+                torch.tensor([0.25, 0.5]),
+                torch.tensor([0.25, 0.5]),
+                momentum=0.6)
+
     @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')
     def test_ema_hook_cuda(self):
         ema = ExponentialMovingAverageHook(**self.default_config)