
Commit 73826fc

sifmelcara authored and piiswrong committed
Fix RMSProp update rule (apache#6235)
* Fix the RMSProp update rule to follow the formula presented in Alex Graves's paper; this prevents taking the square root of a negative value (caused by floating-point rounding error).
* Fix the formula of the non-centered version of RMSProp.
* Fix the RMSProp update rule in the Python test.
* Fix the RMSProp update rule in the Perl test.
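For reference, the corrected update rules implemented by this commit can be written as follows (n: running mean of the squared gradient, g: running mean of the gradient, Δ: accumulated update, w: weight, η: learning rate, γ₁/γ₂: decay rates):

Non-centered RMSProp:

$$n \leftarrow (1-\gamma_1)\,\mathrm{grad}^2 + \gamma_1 n, \qquad w \leftarrow w - \eta \cdot \frac{\mathrm{grad}}{\sqrt{n + \epsilon}}$$

Centered RMSProp (RMSPropAlex), with n updated the same way:

$$g \leftarrow (1-\gamma_1)\,\mathrm{grad} + \gamma_1 g, \qquad \Delta \leftarrow \gamma_2 \Delta - \eta \cdot \frac{\mathrm{grad}}{\sqrt{n - g^2 + \epsilon}}, \qquad w \leftarrow w + \Delta$$

Moving ε inside the square root keeps the radicand strictly positive even when n − g² rounds to a tiny negative number in float32.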
1 parent: b9d491e

File tree: 3 files changed, +24 −22 lines

perl-package/AI-MXNet/t/test_optimizers.t

+2 −2

@@ -166,7 +166,7 @@ method update($index, $weight, $grad, $state)
             $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient);
         }
         $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n;
-        $weight -= $lr * $grad/(mx->nd->sqrt($n) + $self->epsilon);
+        $weight -= $lr * $grad/(mx->nd->sqrt($n + $self->epsilon));
     }
     else
     {
@@ -177,7 +177,7 @@ method update($index, $weight, $grad, $state)
         }
         $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n;
         $g .= (1 - $self->gamma1) * $grad + $self->gamma1 * $g;
-        $delta .= ($self->gamma2) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g) + $self->epsilon);
+        $delta .= ($self->gamma2) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon));
         $weight += $delta;
     }
     if($self->clip_weights)
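The negative-radicand failure the centered branch guards against is easy to reproduce outside MXNet. A minimal NumPy sketch with hypothetical values (not taken from the test suite): mathematically n − g² ≥ 0 always holds for these running means, but float32 rounding can push the difference just below zero, and the old formula then took the square root of a negative number:

```python
import numpy as np

# Running means of the gradient and of the squared gradient; with a
# constant gradient of 0.1 both converge so that, mathematically, n == g*g.
g = np.float32(0.1)
n = np.float32(0.01)

print(n - g * g)                 # ~ -9.3e-10: negative purely from float32 rounding
print(np.sqrt(n - g * g))        # nan -- the bug this commit fixes

eps = np.float32(1e-8)
print(np.sqrt(n - g * g + eps))  # ~ 9.5e-05: epsilon inside the sqrt absorbs the error
```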

src/operator/optimizer_op-inl.h

+20 −18

@@ -300,17 +300,17 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs,
       delta = scalar<DType>(param.gamma2) * delta -
               scalar<DType>(param.lr) *
                   (F<clip>(grad, DType(param.clip_gradient)) /
-                   (F<square_root>(state_n - state_g * state_g) +
-                    scalar<DType>(param.epsilon)));
+                   (F<square_root>(state_n - state_g * state_g +
+                                   scalar<DType>(param.epsilon))));
     } else {
       state_n = scalar<DType>(1.f - param.gamma1) * (grad * grad) +
                 scalar<DType>(param.gamma1) * state_n;
       state_g = scalar<DType>(1.f - param.gamma1) * grad +
                 scalar<DType>(param.gamma1) * state_g;
       delta = scalar<DType>(param.gamma2) * delta -
               scalar<DType>(param.lr) *
-                  (grad / (F<square_root>(state_n - state_g * state_g) +
-                           scalar<DType>(param.epsilon)));
+                  (grad / (F<square_root>(state_n - state_g * state_g +
+                                          scalar<DType>(param.epsilon))));
     }

     if (param.clip_weights >= 0.0f) {
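To make the centered branch above easier to follow, here is a rough NumPy transcription of RMSPropAlexUpdate, assuming gradient and weight clipping are disabled; the hyperparameter defaults are illustrative, and this is a sketch rather than the operator itself:

```python
import numpy as np

def rmsprop_alex_step(weight, grad, state_n, state_g, delta,
                      lr=1e-3, gamma1=0.95, gamma2=0.9, epsilon=1e-8):
    """One centered RMSProp step, mirroring RMSPropAlexUpdate (no clipping)."""
    state_n[:] = (1 - gamma1) * grad * grad + gamma1 * state_n  # mean of grad^2
    state_g[:] = (1 - gamma1) * grad + gamma1 * state_g         # mean of grad
    # epsilon inside the square root keeps the radicand positive even
    # when state_n - state_g**2 rounds slightly below zero
    delta[:] = gamma2 * delta - lr * grad / np.sqrt(state_n - state_g ** 2 + epsilon)
    weight[:] += delta

w = np.ones(3, dtype=np.float32)
n = np.zeros(3, dtype=np.float32)
g = np.zeros(3, dtype=np.float32)
d = np.zeros(3, dtype=np.float32)
rmsprop_alex_step(w, np.float32(0.1) * np.ones(3, dtype=np.float32), n, g, d)
```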
@@ -386,33 +386,35 @@ inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
       if (param.clip_weights >= 0.0f) {
         Assign(out, req[0],
                F<clip>(weight -
-                           scalar<DType>(param.lr) *
-                               (F<clip>(grad, DType(param.clip_gradient)) /
-                                (F<square_root>(state_n) +
-                                 scalar<DType>(param.epsilon))),
+                           scalar<DType>(param.lr) *
+                               (F<clip>(grad, DType(param.clip_gradient)) /
+                                (F<square_root>(state_n +
+                                                scalar<DType>(param.epsilon)))),
                        DType(param.clip_weights)));
       } else {
         Assign(out, req[0], weight -
-                                scalar<DType>(param.lr) *
-                                    (F<clip>(grad, DType(param.clip_gradient)) /
-                                     (F<square_root>(state_n) +
-                                      scalar<DType>(param.epsilon))));
+                                scalar<DType>(param.lr) *
+                                    (F<clip>(grad, DType(param.clip_gradient)) /
+                                     (F<square_root>(state_n +
+                                                     scalar<DType>(param.epsilon)))));
       }
     } else {
       state_n = scalar<DType>(1.f - param.gamma1) * (grad * grad) +
                 scalar<DType>(param.gamma1) * state_n;
       if (param.clip_weights >= 0.0f) {
         Assign(out, req[0],
                F<clip>(weight -
-                           scalar<DType>(param.lr) *
-                               (grad / (F<square_root>(state_n) +
-                                        scalar<DType>(param.epsilon))),
+                           scalar<DType>(param.lr) *
+                               (grad /
+                                (F<square_root>(state_n +
+                                                scalar<DType>(param.epsilon)))),
                        DType(param.clip_weights)));
       } else {
         Assign(out, req[0], weight -
-                                scalar<DType>(param.lr) *
-                                    (grad / (F<square_root>(state_n) +
-                                             scalar<DType>(param.epsilon))));
+                                scalar<DType>(param.lr) *
+                                    (grad /
+                                     (F<square_root>(state_n +
+                                                     scalar<DType>(param.epsilon)))));
      }
    }
  });
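Likewise, the non-centered RMSPropUpdate path reduces to the following NumPy sketch, again assuming clipping is disabled:

```python
import numpy as np

def rmsprop_step(weight, grad, state_n, lr=1e-3, gamma1=0.95, epsilon=1e-8):
    """One plain RMSProp step, mirroring RMSPropUpdate (no clipping)."""
    state_n[:] = (1 - gamma1) * grad * grad + gamma1 * state_n
    # the denominator is now sqrt(n + eps), whose floor is sqrt(eps), not eps
    weight[:] -= lr * grad / np.sqrt(state_n + epsilon)
```

One side effect worth noting: with ε inside the root, the denominator is bounded below by √ε rather than ε, so an ε value tuned for the old formula effectively acts much larger here.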

tests/python/unittest/test_optimizer.py

+2 −2

@@ -301,15 +301,15 @@ def update(self, index, weight, grad, state):
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n
-        weight[:] -= lr * grad/(mx.nd.sqrt(n) + self.epsilon)
+        weight[:] -= lr * grad/(mx.nd.sqrt(n + self.epsilon))

     else:
         n, g, delta = state
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n
         g[:] = (1 - self.gamma1) * grad + self.gamma1 * g
-        delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g) + self.epsilon)
+        delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g + self.epsilon))
         weight[:] += delta

     if self.clip_weights:
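A quick smoke check of the fixed formula using the same mx.nd calls as the test above (the values are hypothetical, chosen so that n − g·g rounds negative in float32):

```python
import mxnet as mx

n = mx.nd.array([0.01])   # running mean of grad**2 (float32 by default)
g = mx.nd.array([0.1])    # running mean of grad
epsilon = 1e-8

# old formula: mx.nd.sqrt(n - g*g) + epsilon  -> nan in the numerator's denominator
# new formula: epsilon inside the sqrt stays finite
print(mx.nd.sqrt(n - g * g + epsilon).asnumpy())
```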
