diff --git a/common/nn/functions.go b/common/nn/functions.go
index 9d75e6595..080fbe5c6 100644
--- a/common/nn/functions.go
+++ b/common/nn/functions.go
@@ -23,7 +23,7 @@ func Add(x0 *Tensor, x ...*Tensor) *Tensor {
 	output := x0
 	for _, x1 := range x {
 		if len(x0.shape) < len(x1.shape) {
-			x0, x1 = x1, x0
+			output, x1 = x1, output
 		}
 		for i := 0; i < len(x1.shape); i++ {
 			if x0.shape[len(x0.shape)-len(x1.shape)+i] != x1.shape[i] {
@@ -214,7 +214,7 @@ func SoftmaxCrossEntropy(x, y *Tensor) *Tensor {
 //
 //	(1 + target) * math32.Log(1+math32.Exp(-prediction)) / 2 + (1 - target) * math32.Log(1+math32.Exp(prediction)) / 2
 func BCEWithLogits(target, prediction *Tensor) *Tensor {
-	return Add(
+	return Mean(Add(
 		Div(
 			Mul(
 				Add(NewScalar(1), target),
@@ -224,5 +224,5 @@ func BCEWithLogits(target, prediction *Tensor) *Tensor {
 			Mul(
 				Sub(NewScalar(1), target),
 				Log(Add(NewScalar(1), Exp(prediction)))),
-			NewScalar(2)))
+			NewScalar(2))))
 }
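Note on the BCEWithLogits hunk: wrapping the expression in Mean reduces the elementwise loss to a single scalar, which matches how the MNIST test below consumes loss.data[0]. One caveat the patch does not touch: math32.Log(1+math32.Exp(prediction)) overflows float32 for large positive logits. The usual fix is the softplus rewrite. A minimal sketch, assuming a hypothetical scalar helper (not part of this patch, and it would still need a matching backward op to join the autograd graph):

```go
package nn

import "github.com/chewxy/math32"

// softplus computes log(1+exp(x)) without overflowing for large |x|,
// using the identity log(1+exp(x)) = max(x, 0) + log(1+exp(-|x|)).
func softplus(x float32) float32 {
	return math32.Max(x, 0) + math32.Log1p(math32.Exp(-math32.Abs(x)))
}
```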
diff --git a/common/nn/layers.go b/common/nn/layers.go
index 12999e700..f7897ffe7 100644
--- a/common/nn/layers.go
+++ b/common/nn/layers.go
@@ -38,9 +38,10 @@ type LinearLayer struct {
 }
 
 func NewLinear(in, out int) Layer {
+	bound := 1.0 / math32.Sqrt(float32(in))
 	return &LinearLayer{
-		W: Normal(0, 1.0/math32.Sqrt(float32(in)), in, out).RequireGrad(),
-		B: Zeros(out).RequireGrad(),
+		W: Uniform(-bound, bound, in, out),
+		B: Zeros(out),
 	}
 }
 
@@ -73,7 +74,7 @@ type EmbeddingLayer struct {
 }
 
 func NewEmbedding(n int, shape ...int) Layer {
 	wShape := append([]int{n}, shape...)
 	return &EmbeddingLayer{
-		W: Rand(wShape...),
+		W: Normal(0, 0.01, wShape...),
 	}
 }
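The linear layer moves from a scaled normal to the fan-in uniform bound 1/sqrt(in), the familiar default for fully connected layers, and embedding tables now start small at N(0, 0.01) instead of Rand's U(0, 1). Dropping the RequireGrad calls is consistent with the tensor.go changes further down, where per-tensor gradient ids disappear. A hypothetical in-package check of the new bound, using the same unexported data access as the existing tests (illustrative only, not part of the patch):

```go
func TestNewLinearInit(t *testing.T) {
	// Every weight drawn by NewLinear(784, 10) must lie within the fan-in
	// bound 1/sqrt(784); the bias starts at exactly zero.
	layer := NewLinear(784, 10).(*LinearLayer)
	bound := 1.0 / math32.Sqrt(float32(784))
	for _, w := range layer.W.data {
		assert.LessOrEqual(t, math32.Abs(w), bound)
	}
	for _, b := range layer.B.data {
		assert.Zero(t, b)
	}
}
```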
diff --git a/common/nn/nn_test.go b/common/nn/nn_test.go
index 41b3c82bc..b26416ab3 100644
--- a/common/nn/nn_test.go
+++ b/common/nn/nn_test.go
@@ -24,8 +24,10 @@ import (
 	"strconv"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/chewxy/math32"
+	"github.com/klauspost/cpuid/v2"
 	"github.com/samber/lo"
 	"github.com/schollz/progressbar/v3"
 	"github.com/stretchr/testify/assert"
@@ -208,7 +210,25 @@ func openMNISTFile(path string) (*Tensor, *Tensor, error) {
 	return NewTensor(images, len(labels), 784), NewTensor(labels, len(labels)), nil
 }
 
+func accuracy(prediction, target *Tensor) float32 {
+	var precision float32
+	for i, gt := range target.data {
+		if prediction.Slice(i, i+1).argmax()[1] == int(gt) {
+			precision += 1
+		}
+	}
+	precision /= float32(len(target.data))
+	return precision
+}
+
 func TestMNIST(t *testing.T) {
+	if cpuid.CPU.VendorString != "Apple" && !cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ) {
+		// Since the test takes a long time, we only run it in development environments:
+		// 1. Mac with Apple Silicon.
+		// 2. x86 CPU with AVX512 support.
+		t.Skip("Skip test on non-development environment.")
+	}
+
 	train, test, err := mnist()
 	assert.NoError(t, err)
 
@@ -219,13 +239,14 @@ func TestMNIST(t *testing.T) {
 	)
 	optimizer := NewAdam(model.Parameters(), 0.001)
 
-	var (
-		sumLoss   float32
+	const (
 		batchSize = 1000
+		numEpoch  = 5
 	)
-	for i := 0; i < 3; i++ {
-		sumLoss = 0
-		bar := progressbar.Default(int64(train.A.shape[0]), fmt.Sprintf("Epoch %v/%v", i+1, 3))
+	for i := 0; i < numEpoch; i++ {
+		startTime := time.Now()
+		sumLoss, sumAcc := float32(0), float32(0)
+		bar := progressbar.Default(int64(train.A.shape[0]), fmt.Sprintf("Epoch %v/%v", i+1, numEpoch))
 		for j := 0; j < train.A.shape[0]; j += batchSize {
 			xBatch := train.A.Slice(j, j+batchSize)
 			yBatch := train.B.Slice(j, j+batchSize)
@@ -238,22 +259,18 @@ func TestMNIST(t *testing.T) {
 			optimizer.Step()
 
 			sumLoss += loss.data[0]
+			sumAcc += accuracy(yPred, yBatch)
 			bar.Add(batchSize)
 		}
 		sumLoss /= float32(train.A.shape[0] / batchSize)
+		sumAcc /= float32(train.A.shape[0] / batchSize)
 		bar.Finish()
+		fmt.Println("Duration:", time.Since(startTime), "Loss:", sumLoss, "Accuracy:", sumAcc)
 	}
-	assert.Less(t, sumLoss, float32(0.4))
-
-	testPred := model.Forward(test.A)
-	var precision float32
-	for i, gt := range test.B.data {
-		if testPred.Slice(i, i+1).argmax()[1] == int(gt) {
-			precision += 1
-		}
-	}
-	precision /= float32(len(test.B.data))
-	assert.Greater(t, float64(precision), 0.92)
+
+	testAcc := accuracy(model.Forward(test.A), test.B)
+	fmt.Println("Test Accuracy:", testAcc)
+	assert.Greater(t, float64(testAcc), 0.96)
 }
 
 func spiral() (*Tensor, *Tensor, error) {
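The op_test.go changes below lean heavily on the numericalDiff helper, which sits outside this diff. Such a checker is typically a central difference on the summed output, matching Backward's convention of seeding the output gradient with ones. A sketch under that assumption (the real helper may differ in step size and details):

```go
// numericalDiff approximates d(sum f(x))/dx_i by central differences:
// (sum f(x + eps*e_i) - sum f(x - eps*e_i)) / (2*eps).
func numericalDiff(f func(*Tensor) *Tensor, x *Tensor) *Tensor {
	const eps = 1e-4
	grad := Zeros(x.shape...)
	for i := range x.data {
		orig := x.data[i]
		x.data[i] = orig + eps
		y1 := f(x)
		x.data[i] = orig - eps
		y0 := f(x)
		x.data[i] = orig // restore the perturbed element
		var sum float32
		for j := range y1.data {
			sum += (y1.data[j] - y0.data[j]) / (2 * eps)
		}
		grad.data[i] = sum
	}
	return grad
}
```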
diff --git a/common/nn/op_test.go b/common/nn/op_test.go
index 162f8661d..65db603f3 100644
--- a/common/nn/op_test.go
+++ b/common/nn/op_test.go
@@ -57,14 +57,14 @@ func allClose(t *testing.T, a, b *Tensor) {
 
 func TestAdd(t *testing.T) {
 	// (2,3) + (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
 	z := Add(x, y)
 	assert.Equal(t, []float32{3, 5, 7, 9, 11, 13}, z.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
-	y = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
+	y = Rand(2, 3)
 	z = Add(x, y)
 	z.Backward()
 	dx := numericalDiff(func(x *Tensor) *Tensor { return Add(x, y) }, x)
@@ -73,8 +73,8 @@ func TestAdd(t *testing.T) {
 	allClose(t, x.grad, dx)
 	allClose(t, y.grad, dy)
 
 	// (2,3) + () -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2})
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2})
 	z = Add(x, y)
 	assert.Equal(t, []float32{3, 4, 5, 6, 7, 8}, z.data)
@@ -84,8 +84,8 @@ func TestAdd(t *testing.T) {
 	// Test gradient
 	z.Backward()
 	assert.Equal(t, []float32{1, 1, 1, 1, 1, 1}, x.grad.data)
 	assert.Equal(t, []float32{6}, y.grad.data)
 
 	// (2,3) + (3) -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2, 3, 4}, 3)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2, 3, 4}, 3)
 	z = Add(x, y)
 	assert.Equal(t, []float32{3, 5, 7, 6, 8, 10}, z.data)
@@ -97,14 +97,14 @@ func TestSub(t *testing.T) {
 	// (2,3) - (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
 	z := Sub(x, y)
 	assert.Equal(t, []float32{-1, -1, -1, -1, -1, -1}, z.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
-	y = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
+	y = Rand(2, 3)
 	z = Sub(x, y)
 	z.Backward()
 	dx := numericalDiff(func(x *Tensor) *Tensor { return Sub(x, y) }, x)
@@ -113,8 +113,8 @@ func TestSub(t *testing.T) {
 	allClose(t, x.grad, dx)
 	allClose(t, y.grad, dy)
 
 	// (2,3) - () -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2})
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2})
 	z = Sub(x, y)
 	assert.Equal(t, []float32{-1, 0, 1, 2, 3, 4}, z.data)
@@ -124,8 +124,8 @@ func TestSub(t *testing.T) {
 	// Test gradient
 	z.Backward()
 	assert.Equal(t, []float32{1, 1, 1, 1, 1, 1}, x.grad.data)
 	assert.Equal(t, []float32{-6}, y.grad.data)
 
 	// (2,3) - (3) -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2, 3, 4}, 3)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2, 3, 4}, 3)
 	z = Sub(x, y)
 	assert.Equal(t, []float32{-1, -1, -1, 2, 2, 2}, z.data)
@@ -137,14 +137,14 @@ func TestMul(t *testing.T) {
 	// (2,3) * (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
 	z := Mul(x, y)
 	assert.Equal(t, []float32{2, 6, 12, 20, 30, 42}, z.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
-	y = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
+	y = Rand(2, 3)
 	z = Mul(x, y)
 	z.Backward()
 	dx := numericalDiff(func(x *Tensor) *Tensor { return Mul(x, y) }, x)
@@ -153,8 +153,8 @@ func TestMul(t *testing.T) {
 	allClose(t, x.grad, dx)
 	allClose(t, y.grad, dy)
 
 	// (2,3) * () -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2})
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2})
 	z = Mul(x, y)
 	assert.Equal(t, []float32{2, 4, 6, 8, 10, 12}, z.data)
@@ -164,8 +164,8 @@ func TestMul(t *testing.T) {
 	// Test gradient
 	z.Backward()
 	assert.Equal(t, []float32{2, 2, 2, 2, 2, 2}, x.grad.data)
 	assert.Equal(t, []float32{21}, y.grad.data)
 
 	// (2,3) * (3) -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2, 3, 4}, 3)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2, 3, 4}, 3)
 	z = Mul(x, y)
 	assert.Equal(t, []float32{2, 6, 12, 8, 15, 24}, z.data)
@@ -177,8 +177,8 @@ func TestDiv(t *testing.T) {
 	// (2,3) / (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
 	z := Div(x, y)
 	assert.InDeltaSlice(t, []float32{0.5, 2.0 / 3.0, 0.75, 4.0 / 5.0, 5.0 / 6.0, 6.0 / 7.0}, z.data, 1e-6)
@@ -190,8 +190,8 @@ func TestDiv(t *testing.T) {
 	allClose(t, x.grad, dx)
 	allClose(t, y.grad, dy)
 
 	// (2,3) / () -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2})
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2})
 	z = Div(x, y)
 	assert.InDeltaSlice(t, []float32{0.5, 1, 1.5, 2, 2.5, 3}, z.data, 1e-6)
@@ -201,8 +201,8 @@ func TestDiv(t *testing.T) {
 	assert.InDeltaSlice(t, []float32{-21.0 / 4.0}, y.grad.data, 1e-6)
 
 	// (2,3) / (3) -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2, 3, 4}, 3)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2, 3, 4}, 3)
 	z = Div(x, y)
 	assert.InDeltaSlice(t, []float32{0.5, 2.0 / 3.0, 3.0 / 4.0, 2, 5.0 / 3.0, 1.5}, z.data, 1e-6)
@@ -214,12 +214,12 @@ func TestSquare(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Square(x)
 	assert.Equal(t, []float32{1, 4, 9, 16, 25, 36}, y.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Square(x)
 	y.Backward()
 	dx := numericalDiff(Square, x)
@@ -228,14 +228,14 @@ func TestPow(t *testing.T) {
 	// (2,3) ** (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3)
 	z := Pow(x, y)
 	assert.InDeltaSlice(t, []float32{1, 8, 81, 1024, 15625, 279936}, z.data, 1e-6)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
-	y = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
+	y = Rand(2, 3)
 	z = Pow(x, y)
 	z.Backward()
 	dx := numericalDiff(func(x *Tensor) *Tensor { return Pow(x, y) }, x)
@@ -244,8 +244,8 @@ func TestPow(t *testing.T) {
 	allClose(t, x.grad, dx)
 	allClose(t, y.grad, dy)
 
 	// (2,3) ** () -> (2,3)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y = NewVariable([]float32{2})
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y = NewTensor([]float32{2})
 	z = Pow(x, y)
 	assert.InDeltaSlice(t, []float32{1, 4, 9, 16, 25, 36}, z.data, 1e-6)
@@ -264,12 +264,12 @@ func TestExp(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{0, 1, 2, 3, 4, 5}, 2, 3)
+	x := NewTensor([]float32{0, 1, 2, 3, 4, 5}, 2, 3)
 	y := Exp(x)
 	assert.InDeltaSlice(t, []float32{1, math32.Exp(1), math32.Exp(2), math32.Exp(3), math32.Exp(4), math32.Exp(5)}, y.data, 1e-5)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Exp(x)
 	y.Backward()
 	dx := numericalDiff(Exp, x)
@@ -278,12 +278,12 @@ func TestLog(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Log(x)
 	assert.InDeltaSlice(t, []float32{0, math32.Log(2), math32.Log(3), math32.Log(4), math32.Log(5), math32.Log(6)}, y.data, 1e-6)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Log(x)
 	y.Backward()
 	dx := numericalDiff(Log, x)
@@ -292,24 +292,24 @@ func TestSum(t *testing.T) {
 	// (2,3) -> ()
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Sum(x)
 	assert.Equal(t, []float32{21}, y.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Sum(x)
 	y.Backward()
 	assert.Equal(t, []float32{1, 1, 1, 1, 1, 1}, x.grad.data)
 
 	// (2,3,2) -> (2,2)
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 3, 2)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 3, 2)
 	y = Sum(x, 1)
 	assert.Equal(t, []int{2, 2}, y.shape)
 	assert.Equal(t, []float32{9, 12, 9, 12}, y.data)
 
 	// Test gradient
-	x = Rand(2, 3, 2).RequireGrad()
+	x = Rand(2, 3, 2)
 	y = Sum(x, 1)
 	y.Backward()
 	assert.Equal(t, []int{2, 3, 2}, x.grad.shape)
@@ -318,12 +318,12 @@ func TestMean(t *testing.T) {
 	// (2,3) -> ()
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Mean(x)
 	assert.Equal(t, []float32{3.5}, y.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Mean(x)
 	y.Backward()
 	assert.Equal(t, []float32{1.0 / 6, 1.0 / 6, 1.0 / 6, 1.0 / 6, 1.0 / 6, 1.0 / 6}, x.grad.data)
 }
@@ -331,12 +331,12 @@ func TestCos(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{0, 0.1, 0.2, 0.3, 0.4, 0.5}, 2, 3)
+	x := NewTensor([]float32{0, 0.1, 0.2, 0.3, 0.4, 0.5}, 2, 3)
 	y := Cos(x)
 	assert.InDeltaSlice(t, []float32{1, 0.9950041652780258, 0.9800665778412416, 0.955336489125606, 0.9210609940028851, 0.8775825618903728}, y.data, 1e-6)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Cos(x)
 	y.Backward()
 	dx := numericalDiff(Cos, x)
@@ -345,12 +345,12 @@ func TestSin(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{0, 1, 2, 3, 4, 5}, 2, 3)
+	x := NewTensor([]float32{0, 1, 2, 3, 4, 5}, 2, 3)
 	y := Sin(x)
 	assert.InDeltaSlice(t, []float32{0, 0.8414709848078965, 0.9092974268256817, 0.1411200080598672, -0.7568024953079282, -0.9589242746631385}, y.data, 1e-6)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Sin(x)
 	y.Backward()
 	dx := numericalDiff(Sin, x)
@@ -359,8 +359,8 @@ func TestMatMul(t *testing.T) {
 	// (2,3) * (3,4) -> (2,4)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	y := NewVariable([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 4)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	y := NewTensor([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 4)
 	z := MatMul(x, y)
 	assert.Equal(t, []int{2, 4}, z.shape)
 	assert.Equal(t, []float32{38, 44, 50, 56, 83, 98, 113, 128}, z.data)
@@ -373,8 +373,8 @@ func TestMatMul(t *testing.T) {
 	assert.Equal(t, []float32{5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9}, y.grad.data)
 
 	// (3,2).T * (3,4) -> (2,4)
-	x = Rand(3, 2).RequireGrad()
-	y = Rand(3, 4).RequireGrad()
+	x = Rand(3, 2)
+	y = Rand(3, 4)
 	z = MatMul(x, y, true, false)
 	assert.Equal(t, []int{2, 4}, z.shape)
 	z.Backward()
@@ -382,8 +382,8 @@ func TestMatMul(t *testing.T) {
 	assert.Equal(t, []int{3, 4}, y.grad.shape)
 
 	// (2,3) * (4,3).T -> (2,4)
-	x = Rand(2, 3).RequireGrad()
-	y = Rand(4, 3).RequireGrad()
+	x = Rand(2, 3)
+	y = Rand(4, 3)
 	z = MatMul(x, y, false, true)
 	assert.Equal(t, []int{2, 4}, z.shape)
 	z.Backward()
@@ -391,8 +391,8 @@ func TestMatMul(t *testing.T) {
 	assert.Equal(t, []int{4, 3}, y.grad.shape)
 
 	// (3,2).T * (4,3).T -> (2,4)
-	x = Rand(3, 2).RequireGrad()
-	y = Rand(4, 3).RequireGrad()
+	x = Rand(3, 2)
+	y = Rand(4, 3)
 	z = MatMul(x, y, true, true)
 	assert.Equal(t, []int{2, 4}, z.shape)
 	z.Backward()
@@ -401,8 +401,8 @@ func TestBMM(t *testing.T) {
 	// (2,2,3) * (2,3,4) -> (2,2,4)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 2, 3)
-	y := NewVariable([]float32{
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 2, 3)
+	y := NewTensor([]float32{
 		1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
 		1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
 	}, 2, 3, 4)
@@ -427,24 +427,24 @@ func TestBMM(t *testing.T) {
 	}, y.grad.data)
 
 	// (2,3,2).T * (2,3,4) -> (2,2,4)
-	x = Rand(2, 3, 2).RequireGrad()
-	y = Rand(2, 3, 4).RequireGrad()
+	x = Rand(2, 3, 2)
+	y = Rand(2, 3, 4)
 	z = BMM(x, y, true, false)
 	assert.Equal(t, []int{2, 2, 4}, z.shape)
 	z.Backward()
 	assert.Equal(t, []int{2, 3, 2}, x.grad.shape)
 
 	// (2,2,3) * (2,4,3).T -> (2,2,4)
-	x = Rand(2, 2, 3).RequireGrad()
-	y = Rand(2, 4, 3).RequireGrad()
+	x = Rand(2, 2, 3)
+	y = Rand(2, 4, 3)
 	z = BMM(x, y, false, true)
 	assert.Equal(t, []int{2, 2, 4}, z.shape)
 	z.Backward()
 	assert.Equal(t, []int{2, 2, 3}, x.grad.shape)
 
 	// (2,3,2).T * (2,4,3).T -> (2,2,4)
-	x = Rand(2, 3, 2).RequireGrad()
-	y = Rand(2, 4, 3).RequireGrad()
+	x = Rand(2, 3, 2)
+	y = Rand(2, 4, 3)
 	z = BMM(x, y, true, true)
 	assert.Equal(t, []int{2, 2, 4}, z.shape)
 	z.Backward()
@@ -453,7 +453,7 @@ func TestBroadcast(t *testing.T) {
 	// (2) -> (2,3)
-	x := NewVariable([]float32{1, 2}, 2)
+	x := NewTensor([]float32{1, 2}, 2)
 	y := Broadcast(x, 3)
 	assert.Equal(t, []float32{1, 1, 1, 2, 2, 2}, y.data)
 
@@ -464,8 +464,8 @@ func TestEmbedding(t *testing.T) {
 	// (2,3) -> (2,3,2)
-	x := NewVariable([]float32{0, 1, 0, 3, 0, 5}, 2, 3)
-	w := NewVariable([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 2)
+	x := NewTensor([]float32{0, 1, 0, 3, 0, 5}, 2, 3)
+	w := NewTensor([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 2)
 	y := Embedding(w, x)
 	assert.Equal(t, []int{2, 3, 2}, y.shape)
 	assert.Equal(t, []float32{0, 1, 2, 3, 0, 1, 6, 7, 0, 1, 10, 11}, y.data)
@@ -476,8 +476,8 @@ func TestEmbedding(t *testing.T) {
 	// Test gradient
 	y.Backward()
 	assert.Equal(t, []float32{3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, w.grad.data)
 
 	// (2,3) -> (2,3,1,2)
-	x = NewVariable([]float32{0, 1, 0, 3, 0, 5}, 2, 3)
-	w = NewVariable([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 1, 2)
+	x = NewTensor([]float32{0, 1, 0, 3, 0, 5}, 2, 3)
+	w = NewTensor([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 1, 2)
 	y = Embedding(w, x)
 	assert.Equal(t, []int{2, 3, 1, 2}, y.shape)
 	assert.Equal(t, []float32{0, 1, 2, 3, 0, 1, 6, 7, 0, 1, 10, 11}, y.data)
@@ -490,12 +490,12 @@ func TestSigmoid(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{0, 1, 2, 3, 4, 5}, 2, 3)
+	x := NewTensor([]float32{0, 1, 2, 3, 4, 5}, 2, 3)
 	y := Sigmoid(x)
 	assert.InDeltaSlice(t, []float32{0.5, 0.7310585786300049, 0.8807970779778823, 0.9525741268224334, 0.9820137900379085, 0.9933071490757153}, y.data, 1e-6)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = Sigmoid(x)
 	y.Backward()
 	dx := numericalDiff(Sigmoid, x)
@@ -504,12 +504,12 @@ func TestReLu(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{-1, 0, 1, 2, 3, 4}, 2, 3)
+	x := NewTensor([]float32{-1, 0, 1, 2, 3, 4}, 2, 3)
 	y := ReLu(x)
 	assert.Equal(t, []float32{0, 0, 1, 2, 3, 4}, y.data)
 
 	// Test gradient
-	x = Rand(2, 3).RequireGrad()
+	x = Rand(2, 3)
 	y = ReLu(x)
 	y.Backward()
 	dx := numericalDiff(ReLu, x)
@@ -518,7 +518,7 @@ func TestSoftmax(t *testing.T) {
 	// (1,3) -> (1,3)
-	x := NewVariable([]float32{3.0, 1.0, 0.2}, 1, 3)
+	x := NewTensor([]float32{3.0, 1.0, 0.2}, 1, 3)
 	y := Softmax(x, 1)
 	assert.Equal(t, []int{1, 3}, y.shape)
 	assert.InDeltaSlice(t, []float32{0.8360188027814407, 0.11314284146556013, 0.05083835575299916}, y.data, 1e-6)
@@ -531,7 +531,7 @@ func TestFlatten(t *testing.T) {
 	// (2,3) -> (6)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Flatten(x)
 	assert.Equal(t, []float32{1, 2, 3, 4, 5, 6}, y.data)
 
@@ -542,7 +542,7 @@ func TestReshape(t *testing.T) {
 	// (2,3) -> (3,2)
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Reshape(x, 3, 2)
 	assert.Equal(t, []float32{1, 2, 3, 4, 5, 6}, y.data)
 
@@ -553,8 +553,8 @@ func TestSoftmaxCrossEntropy(t *testing.T) {
 	// (2,3) -> (2,3)
-	x := NewVariable([]float32{0.3, 2.9, 4.0, 0.2, 1.0, 3.0}, 3, 2)
-	y := NewVariable([]float32{1, 0, 1}, 3)
+	x := NewTensor([]float32{0.3, 2.9, 4.0, 0.2, 1.0, 3.0}, 3, 2)
+	y := NewTensor([]float32{1, 0, 1}, 3)
 	z := SoftmaxCrossEntropy(x, y)
 	assert.Empty(t, z.shape)
 	assert.InDelta(t, float32(0.07356563982184072), z.data[0], 1e-4)
@@ -567,7 +567,7 @@ func TestReuseLeaf(t *testing.T) {
 	// x + x
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
 	y := Add(x, x)
 	assert.Equal(t, []float32{2, 4, 6, 8, 10, 12}, y.data)
 
@@ -579,15 +579,15 @@ func TestReuseNode(t *testing.T) {
 	// x^2 + x^2
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	temp := Pow(x, NewVariable([]float32{2}))
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	temp := Pow(x, NewTensor([]float32{2}))
 	y := Add(temp, temp)
 	assert.Equal(t, []float32{2, 8, 18, 32, 50, 72}, y.data)
 
 	// Test gradient
 	y.Backward()
 	dx := numericalDiff(func(x *Tensor) *Tensor {
-		temp := Pow(x, NewVariable([]float32{2}))
+		temp := Pow(x, NewTensor([]float32{2}))
 		return Add(temp, temp)
 	}, x)
 	allClose(t, x.grad, dx)
@@ -595,16 +595,16 @@ func TestDependency(t *testing.T) {
 	// x^2 + 2x^2
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
-	temp := Pow(x, NewVariable([]float32{2}))
-	y := Add(temp, Mul(NewVariable([]float32{2}), temp))
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
+	temp := Pow(x, NewTensor([]float32{2}))
+	y := Add(temp, Mul(NewTensor([]float32{2}), temp))
 	assert.Equal(t, []float32{3, 12, 27, 48, 75, 108}, y.data)
 
 	// Test gradient
 	y.Backward()
 	dx := numericalDiff(func(x *Tensor) *Tensor {
-		temp := Pow(x, NewVariable([]float32{2}))
-		return Add(temp, Mul(NewVariable([]float32{2}), temp))
+		temp := Pow(x, NewTensor([]float32{2}))
+		return Add(temp, Mul(NewTensor([]float32{2}), temp))
 	}, x)
 	allClose(t, x.grad, dx)
 }
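A property these tests exercise repeatedly: when an operand is broadcast, its gradient is summed back over the broadcast axes, which is why the scalar y in TestAdd ends up with y.grad = {6} after six upstream ones. The same reasoning for a (3) vector added to a (2,3) matrix, as an illustrative companion test (not part of the patch; each element of y receives the sum over the two rows, i.e. 2):

```go
func TestBroadcastGradient(t *testing.T) {
	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3)
	y := NewTensor([]float32{2, 3, 4}, 3)
	z := Add(x, y)
	z.Backward()
	// x keeps the upstream ones; y's gradient is reduced over the rows.
	assert.Equal(t, []float32{1, 1, 1, 1, 1, 1}, x.grad.data)
	assert.Equal(t, []float32{2, 2, 2}, y.grad.data)
}
```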
diff --git a/common/nn/optimizers.go b/common/nn/optimizers.go
index 314980ade..745bfd087 100644
--- a/common/nn/optimizers.go
+++ b/common/nn/optimizers.go
@@ -16,7 +16,6 @@ package nn
 
 import (
 	"github.com/chewxy/math32"
-	"github.com/google/uuid"
 )
 
 type Optimizer interface {
@@ -60,8 +59,9 @@ type Adam struct {
 	beta1 float32
 	beta2 float32
 	eps   float32
-	ms    map[uuid.UUID]*Tensor
-	vs    map[uuid.UUID]*Tensor
+	ms    map[*Tensor]*Tensor
+	vs    map[*Tensor]*Tensor
+	t     float32
 }
 
@@ -71,28 +71,33 @@ func NewAdam(params []*Tensor, alpha float32) Optimizer {
 		beta1: 0.9,
 		beta2: 0.999,
 		eps:   1e-8,
-		ms:    make(map[uuid.UUID]*Tensor),
-		vs:    make(map[uuid.UUID]*Tensor),
+		ms:    make(map[*Tensor]*Tensor),
+		vs:    make(map[*Tensor]*Tensor),
 	}
 }
 
 func (a *Adam) Step() {
+	a.t++
 	for _, p := range a.params {
-		if _, ok := a.ms[p.id]; !ok {
-			a.ms[p.id] = Zeros(p.shape...)
-			a.vs[p.id] = Zeros(p.shape...)
+		if _, ok := a.ms[p]; !ok {
+			a.ms[p] = Zeros(p.shape...)
+			a.vs[p] = Zeros(p.shape...)
 		}
-		m, v := a.ms[p.id], a.vs[p.id]
+		m, v := a.ms[p], a.vs[p]
 		grad := p.grad.data
+		fix1 := 1 - math32.Pow(a.beta1, a.t)
+		fix2 := 1 - math32.Pow(a.beta2, a.t)
+		lr := a.alpha * math32.Sqrt(fix2) / fix1
+
 		for i := range m.data {
 			// m += (1 - beta1) * (grad - m)
 			m.data[i] += (1 - a.beta1) * (grad[i] - m.data[i])
 			// v += (1 - beta2) * (grad * grad - v)
 			v.data[i] += (1 - a.beta2) * (grad[i]*grad[i] - v.data[i])
 			// param.data -= self.lr * m / (xp.sqrt(v) + eps)
-			p.data[i] -= a.alpha * m.data[i] / (math32.Sqrt(v.data[i]) + a.eps)
+			p.data[i] -= lr * m.data[i] / (math32.Sqrt(v.data[i]) + a.eps)
 		}
 	}
 }
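The new t counter gives Adam its bias correction: rather than dividing m by 1-beta1^t and v by 1-beta2^t separately, the patch folds both into the effective step size lr = alpha * sqrt(1-beta2^t) / (1-beta1^t), the reformulation suggested in Kingma and Ba's Adam paper. With the defaults above, an uncorrected first step would be about sqrt(1/(1-beta2)) * (1-beta1) ≈ 3.16 times too large, since m and v start at zero. A standalone check of that first step (the gradient value is arbitrary):

```go
package main

import (
	"fmt"

	"github.com/chewxy/math32"
)

func main() {
	const (
		alpha = float32(0.001)
		beta1 = float32(0.9)
		beta2 = float32(0.999)
		g     = float32(0.5) // example gradient
	)
	// After one step from zero state: m = (1-beta1)*g, v = (1-beta2)*g*g.
	m := (1 - beta1) * g
	v := (1 - beta2) * g * g
	t := float32(1)
	lr := alpha * math32.Sqrt(1-math32.Pow(beta2, t)) / (1 - math32.Pow(beta1, t))
	fmt.Println(alpha*m/math32.Sqrt(v)) // uncorrected: ≈ 0.00316, 3.16x alpha
	fmt.Println(lr*m/math32.Sqrt(v))    // corrected:   ≈ 0.001, i.e. alpha
}
```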
diff --git a/common/nn/tensor.go b/common/nn/tensor.go
index 962e70d0b..9fdb147dd 100644
--- a/common/nn/tensor.go
+++ b/common/nn/tensor.go
@@ -23,7 +23,6 @@ import (
 	"github.com/chewxy/math32"
 	mapset "github.com/deckarep/golang-set/v2"
-	"github.com/google/uuid"
 	"github.com/samber/lo"
 	"github.com/zhenghaoz/gorse/base/floats"
 	"github.com/zhenghaoz/gorse/protocol"
@@ -35,9 +34,6 @@ type Tensor struct {
 	shape []int
 	grad  *Tensor
 	op    op
-
-	requireGrad bool
-	id          uuid.UUID // Only assigned if requireGrad is true
 }
 
 func NewTensor(data []float32, shape ...int) *Tensor {
@@ -54,10 +50,6 @@ func NewTensor(data []float32, shape ...int) *Tensor {
 	}
 }
 
-func NewVariable(data []float32, shape ...int) *Tensor {
-	return NewTensor(data, shape...).RequireGrad()
-}
-
 func NewScalar(data float32) *Tensor {
 	return &Tensor{
 		data: []float32{data},
@@ -96,6 +88,21 @@ func Rand(shape ...int) *Tensor {
 	}
 }
 
+func Uniform(low, high float32, shape ...int) *Tensor {
+	n := 1
+	for _, s := range shape {
+		n *= s
+	}
+	data := make([]float32, n)
+	for i := range data {
+		data[i] = rand.Float32()*(high-low) + low
+	}
+	return &Tensor{
+		data:  data,
+		shape: shape,
+	}
+}
+
 func Normal(mean, std float32, shape ...int) *Tensor {
 	n := 1
 	for _, s := range shape {
@@ -159,12 +166,6 @@ func (t *Tensor) NoGrad() *Tensor {
 	return t
 }
 
-func (t *Tensor) RequireGrad() *Tensor {
-	t.requireGrad = true
-	t.id = uuid.New()
-	return t
-}
-
 func (t *Tensor) Shape() []int {
 	return t.shape
 }
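With requireGrad and the uuid field gone, a parameter is now identified by its pointer, which is stable for the life of the training loop; that is what makes the map[*Tensor]*Tensor optimizer state above safe. Uniform joins Rand, Normal and Zeros as an initialiser. A usage sketch inside package nn (the bounds here are illustrative, not prescribed anywhere in the patch):

```go
package nn

// exampleInitialisers draws parameters with the initialisers used by
// NewLinear and NewEmbedding above.
func exampleInitialisers() (w, e *Tensor) {
	w = Uniform(-0.05, 0.05, 784, 10) // each element uniform in [-0.05, 0.05)
	e = Normal(0, 0.01, 1000, 16)     // each element from N(0, 0.01)
	return w, e
}
```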
diff --git a/common/nn/tensor_test.go b/common/nn/tensor_test.go
index 23ec2075f..25a444d38 100644
--- a/common/nn/tensor_test.go
+++ b/common/nn/tensor_test.go
@@ -41,7 +41,7 @@ func TestTensor_SliceIndices(t *testing.T) {
 }
 
 func TestTensor_Max(t *testing.T) {
-	x := NewVariable([]float32{3, 2, 5, 6, 0, 0}, 6)
+	x := NewTensor([]float32{3, 2, 5, 6, 0, 0}, 6)
 	y := x.max(0, false)
 	assert.Len(t, y.shape, 0)
 	assert.Equal(t, []float32{6}, y.data)
@@ -49,7 +49,7 @@ func TestTensor_Max(t *testing.T) {
 	assert.Panics(t, func() { x.max(-1, false) })
 	assert.Panics(t, func() { x.max(2, false) })
 
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2)
 	y = x.max(1, false)
 	assert.Equal(t, []int{3, 2}, y.shape)
 	assert.Equal(t, []float32{3, 4, 7, 8, 11, 12}, y.data)
@@ -59,7 +59,7 @@ func TestTensor_Max(t *testing.T) {
 }
 
 func TestTensor_Sum(t *testing.T) {
-	x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 6)
+	x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 6)
 	y := x.sum(0, false)
 	assert.Len(t, y.shape, 0)
 	assert.Equal(t, []float32{21}, y.data)
@@ -67,7 +67,7 @@ func TestTensor_Sum(t *testing.T) {
 	assert.Panics(t, func() { x.sum(-1, false) })
 	assert.Panics(t, func() { x.sum(2, false) })
 
-	x = NewVariable([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2)
+	x = NewTensor([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2)
 	y = x.sum(1, false)
 	assert.Equal(t, []int{3, 2}, y.shape)
 	assert.Equal(t, []float32{4, 6, 12, 14, 20, 22}, y.data)
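For readers tracing the final asserts: summing dim 1 of the (3,2,2) tensor collapses each 2x2 block along its first axis, so the block [[1,2],[3,4]] becomes [1+3, 2+4] = [4, 6], and the full result is {4, 6, 12, 14, 20, 22}. The same arithmetic on a minimal tensor, as a hypothetical companion test (not part of the patch):

```go
func TestTensor_SumDim(t *testing.T) {
	// [[1,2],[3,4]] summed along dim 1 -> [4, 6].
	x := NewTensor([]float32{1, 2, 3, 4}, 1, 2, 2)
	y := x.sum(1, false)
	assert.Equal(t, []int{1, 2}, y.shape)
	assert.Equal(t, []float32{4, 6}, y.data)
}
```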