From 84b3bf77a202f25e864c5bf61fa8e5a06fc2cc55 Mon Sep 17 00:00:00 2001 From: Zhenghao Zhang Date: Sun, 5 Jan 2025 02:34:39 +0800 Subject: [PATCH 1/3] nn: remove `requireGrad` and `id` --- common/nn/functions.go | 6 +- common/nn/layers.go | 6 +- common/nn/op_test.go | 182 +++++++++++++++++++-------------------- common/nn/optimizers.go | 17 ++-- common/nn/tensor.go | 14 --- common/nn/tensor_test.go | 8 +- 6 files changed, 109 insertions(+), 124 deletions(-) diff --git a/common/nn/functions.go b/common/nn/functions.go index 9d75e6595..080fbe5c6 100644 --- a/common/nn/functions.go +++ b/common/nn/functions.go @@ -23,7 +23,7 @@ func Add(x0 *Tensor, x ...*Tensor) *Tensor { output := x0 for _, x1 := range x { if len(x0.shape) < len(x1.shape) { - x0, x1 = x1, x0 + output, x1 = x1, output } for i := 0; i < len(x1.shape); i++ { if x0.shape[len(x0.shape)-len(x1.shape)+i] != x1.shape[i] { @@ -214,7 +214,7 @@ func SoftmaxCrossEntropy(x, y *Tensor) *Tensor { // // (1 + target) * math32.Log(1+math32.Exp(-prediction)) / 2 + (1 - target) * math32.Log(1+math32.Exp(prediction)) / 2 func BCEWithLogits(target, prediction *Tensor) *Tensor { - return Add( + return Mean(Add( Div( Mul( Add(NewScalar(1), target), @@ -224,5 +224,5 @@ func BCEWithLogits(target, prediction *Tensor) *Tensor { Mul( Sub(NewScalar(1), target), Log(Add(NewScalar(1), Exp(prediction)))), - NewScalar(2))) + NewScalar(2)))) } diff --git a/common/nn/layers.go b/common/nn/layers.go index ce93af37c..3774cb8c4 100644 --- a/common/nn/layers.go +++ b/common/nn/layers.go @@ -30,8 +30,8 @@ type linearLayer struct { func NewLinear(in, out int) Layer { return &linearLayer{ - w: Normal(0, 1.0/math32.Sqrt(float32(in)), in, out).RequireGrad(), - b: Zeros(out).RequireGrad(), + w: Normal(0, 1.0/math32.Sqrt(float32(in)), in, out), + b: Zeros(out), } } @@ -64,7 +64,7 @@ type embeddingLayer struct { func NewEmbedding(n int, shape ...int) Layer { wShape := append([]int{n}, shape...) return &embeddingLayer{ - w: Rand(wShape...), + w: Normal(0, 0.01, wShape...), } } diff --git a/common/nn/op_test.go b/common/nn/op_test.go index 162f8661d..65db603f3 100644 --- a/common/nn/op_test.go +++ b/common/nn/op_test.go @@ -57,14 +57,14 @@ func allClose(t *testing.T, a, b *Tensor) { func TestAdd(t *testing.T) { // (2,3) + (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3) z := Add(x, y) assert.Equal(t, []float32{3, 5, 7, 9, 11, 13}, z.data) // Test gradient - x = Rand(2, 3).RequireGrad() - y = Rand(2, 3).RequireGrad() + x = Rand(2, 3) + y = Rand(2, 3) z = Add(x, y) z.Backward() dx := numericalDiff(func(x *Tensor) *Tensor { return Add(x, y) }, x) @@ -73,8 +73,8 @@ func TestAdd(t *testing.T) { allClose(t, y.grad, dy) // (2,3) + () -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2}) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2}) z = Add(x, y) assert.Equal(t, []float32{3, 4, 5, 6, 7, 8}, z.data) @@ -84,8 +84,8 @@ func TestAdd(t *testing.T) { assert.Equal(t, []float32{6}, y.grad.data) // (2,3) + (3) -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2, 3, 4}, 3) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2, 3, 4}, 3) z = Add(x, y) assert.Equal(t, []float32{3, 5, 7, 6, 8, 10}, z.data) @@ -97,14 +97,14 @@ func TestAdd(t *testing.T) { func TestSub(t *testing.T) { // (2,3) - (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3) z := Sub(x, y) assert.Equal(t, []float32{-1, -1, -1, -1, -1, -1}, z.data) // Test gradient - x = Rand(2, 3).RequireGrad() - y = Rand(2, 3).RequireGrad() + x = Rand(2, 3) + y = Rand(2, 3) z = Sub(x, y) z.Backward() dx := numericalDiff(func(x *Tensor) *Tensor { return Sub(x, y) }, x) @@ -113,8 +113,8 @@ func TestSub(t *testing.T) { allClose(t, y.grad, dy) // (2,3) - () -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2}) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2}) z = Sub(x, y) assert.Equal(t, []float32{-1, 0, 1, 2, 3, 4}, z.data) @@ -124,8 +124,8 @@ func TestSub(t *testing.T) { assert.Equal(t, []float32{-6}, y.grad.data) // (2,3) - (3) -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2, 3, 4}, 3) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2, 3, 4}, 3) z = Sub(x, y) assert.Equal(t, []float32{-1, -1, -1, 2, 2, 2}, z.data) @@ -137,14 +137,14 @@ func TestSub(t *testing.T) { func TestMul(t *testing.T) { // (2,3) * (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3) z := Mul(x, y) assert.Equal(t, []float32{2, 6, 12, 20, 30, 42}, z.data) // Test gradient - x = Rand(2, 3).RequireGrad() - y = Rand(2, 3).RequireGrad() + x = Rand(2, 3) + y = Rand(2, 3) z = Mul(x, y) z.Backward() dx := numericalDiff(func(x *Tensor) *Tensor { return Mul(x, y) }, x) @@ -153,8 +153,8 @@ func TestMul(t *testing.T) { allClose(t, y.grad, dy) // (2,3) * () -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2}) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2}) z = Mul(x, y) assert.Equal(t, []float32{2, 4, 6, 8, 10, 12}, z.data) @@ -164,8 +164,8 @@ func TestMul(t *testing.T) { assert.Equal(t, []float32{21}, y.grad.data) // (2,3) * (3) -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2, 3, 4}, 3) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2, 3, 4}, 3) z = Mul(x, y) assert.Equal(t, []float32{2, 6, 12, 8, 15, 24}, z.data) @@ -177,8 +177,8 @@ func TestMul(t *testing.T) { func TestDiv(t *testing.T) { // (2,3) / (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3) z := Div(x, y) assert.InDeltaSlice(t, []float32{0.5, 2.0 / 3.0, 0.75, 4.0 / 5.0, 5.0 / 6.0, 6.0 / 7.0}, z.data, 1e-6) @@ -190,8 +190,8 @@ func TestDiv(t *testing.T) { allClose(t, y.grad, dy) // (2,3) / () -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2}) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2}) z = Div(x, y) assert.InDeltaSlice(t, []float32{0.5, 1, 1.5, 2, 2.5, 3}, z.data, 1e-6) @@ -201,8 +201,8 @@ func TestDiv(t *testing.T) { assert.InDeltaSlice(t, []float32{-21.0 / 4.0}, y.grad.data, 1e-6) // (2,3) / (3) -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2, 3, 4}, 3) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2, 3, 4}, 3) z = Div(x, y) assert.InDeltaSlice(t, []float32{0.5, 2.0 / 3.0, 3.0 / 4.0, 2, 5.0 / 3.0, 1.5}, z.data, 1e-6) @@ -214,12 +214,12 @@ func TestDiv(t *testing.T) { func TestSquare(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Square(x) assert.Equal(t, []float32{1, 4, 9, 16, 25, 36}, y.data) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Square(x) y.Backward() dx := numericalDiff(Square, x) @@ -228,14 +228,14 @@ func TestSquare(t *testing.T) { func TestPow(t *testing.T) { // (2,3) ** (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y := NewVariable([]float32{2, 3, 4, 5, 6, 7}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y := NewTensor([]float32{2, 3, 4, 5, 6, 7}, 2, 3) z := Pow(x, y) assert.InDeltaSlice(t, []float32{1, 8, 81, 1024, 15625, 279936}, z.data, 1e-6) // Test gradient - x = Rand(2, 3).RequireGrad() - y = Rand(2, 3).RequireGrad() + x = Rand(2, 3) + y = Rand(2, 3) z = Pow(x, y) z.Backward() dx := numericalDiff(func(x *Tensor) *Tensor { return Pow(x, y) }, x) @@ -244,8 +244,8 @@ func TestPow(t *testing.T) { allClose(t, y.grad, dy) // (2,3) ** () -> (2,3) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y = NewVariable([]float32{2}) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y = NewTensor([]float32{2}) z = Pow(x, y) assert.InDeltaSlice(t, []float32{1, 4, 9, 16, 25, 36}, z.data, 1e-6) @@ -264,12 +264,12 @@ func TestPow(t *testing.T) { func TestExp(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{0, 1, 2, 3, 4, 5}, 2, 3) + x := NewTensor([]float32{0, 1, 2, 3, 4, 5}, 2, 3) y := Exp(x) assert.InDeltaSlice(t, []float32{1, math32.Exp(1), math32.Exp(2), math32.Exp(3), math32.Exp(4), math32.Exp(5)}, y.data, 1e-5) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Exp(x) y.Backward() dx := numericalDiff(Exp, x) @@ -278,12 +278,12 @@ func TestExp(t *testing.T) { func TestLog(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Log(x) assert.InDeltaSlice(t, []float32{0, math32.Log(2), math32.Log(3), math32.Log(4), math32.Log(5), math32.Log(6)}, y.data, 1e-6) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Log(x) y.Backward() dx := numericalDiff(Log, x) @@ -292,24 +292,24 @@ func TestLog(t *testing.T) { func TestSum(t *testing.T) { // (2,3) -> () - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Sum(x) assert.Equal(t, []float32{21}, y.data) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Sum(x) y.Backward() assert.Equal(t, []float32{1, 1, 1, 1, 1, 1}, x.grad.data) // (2,3,2) -> (2,2) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 3, 2) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 3, 2) y = Sum(x, 1) assert.Equal(t, []int{2, 2}, y.shape) assert.Equal(t, []float32{9, 12, 9, 12}, y.data) // Test gradient - x = Rand(2, 3, 2).RequireGrad() + x = Rand(2, 3, 2) y = Sum(x, 1) y.Backward() assert.Equal(t, []int{2, 3, 2}, x.grad.shape) @@ -318,12 +318,12 @@ func TestSum(t *testing.T) { func TestMean(t *testing.T) { // (2,3) -> () - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Mean(x) assert.Equal(t, []float32{3.5}, y.data) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Mean(x) y.Backward() assert.Equal(t, []float32{1.0 / 6, 1.0 / 6, 1.0 / 6, 1.0 / 6, 1.0 / 6, 1.0 / 6}, x.grad.data) @@ -331,12 +331,12 @@ func TestMean(t *testing.T) { func TestCos(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{0, 0.1, 0.2, 0.3, 0.4, 0.5}, 2, 3) + x := NewTensor([]float32{0, 0.1, 0.2, 0.3, 0.4, 0.5}, 2, 3) y := Cos(x) assert.InDeltaSlice(t, []float32{1, 0.9950041652780258, 0.9800665778412416, 0.955336489125606, 0.9210609940028851, 0.8775825618903728}, y.data, 1e-6) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Cos(x) y.Backward() dx := numericalDiff(Cos, x) @@ -345,12 +345,12 @@ func TestCos(t *testing.T) { func TestSin(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{0, 1, 2, 3, 4, 5}, 2, 3) + x := NewTensor([]float32{0, 1, 2, 3, 4, 5}, 2, 3) y := Sin(x) assert.InDeltaSlice(t, []float32{0, 0.8414709848078965, 0.9092974268256817, 0.1411200080598672, -0.7568024953079282, -0.9589242746631385}, y.data, 1e-6) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Sin(x) y.Backward() dx := numericalDiff(Sin, x) @@ -359,8 +359,8 @@ func TestSin(t *testing.T) { func TestMatMul(t *testing.T) { // (2,3) * (3,4) -> (2,4) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - y := NewVariable([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 4) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + y := NewTensor([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 4) z := MatMul(x, y) assert.Equal(t, []int{2, 4}, z.shape) assert.Equal(t, []float32{38, 44, 50, 56, 83, 98, 113, 128}, z.data) @@ -373,8 +373,8 @@ func TestMatMul(t *testing.T) { assert.Equal(t, []float32{5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9}, y.grad.data) // (3,2).T * (3,4) -> (2,4) - x = Rand(3, 2).RequireGrad() - y = Rand(3, 4).RequireGrad() + x = Rand(3, 2) + y = Rand(3, 4) z = MatMul(x, y, true, false) assert.Equal(t, []int{2, 4}, z.shape) z.Backward() @@ -382,8 +382,8 @@ func TestMatMul(t *testing.T) { assert.Equal(t, []int{3, 4}, y.grad.shape) // (2,3) * (4,3).T -> (2,4) - x = Rand(2, 3).RequireGrad() - y = Rand(4, 3).RequireGrad() + x = Rand(2, 3) + y = Rand(4, 3) z = MatMul(x, y, false, true) assert.Equal(t, []int{2, 4}, z.shape) z.Backward() @@ -391,8 +391,8 @@ func TestMatMul(t *testing.T) { assert.Equal(t, []int{4, 3}, y.grad.shape) // (3,2).T * (4,3).T -> (2,4) - x = Rand(3, 2).RequireGrad() - y = Rand(4, 3).RequireGrad() + x = Rand(3, 2) + y = Rand(4, 3) z = MatMul(x, y, true, true) assert.Equal(t, []int{2, 4}, z.shape) z.Backward() @@ -401,8 +401,8 @@ func TestMatMul(t *testing.T) { func TestBMM(t *testing.T) { // (2,2,3) * (2,3,4) -> (2,2,4) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 2, 3) - y := NewVariable([]float32{ + x := NewTensor([]float32{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, 2, 2, 3) + y := NewTensor([]float32{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, }, 2, 3, 4) @@ -427,24 +427,24 @@ func TestBMM(t *testing.T) { }, y.grad.data) // (2,3,2).T * (2,3,4) -> (2,2,4) - x = Rand(2, 3, 2).RequireGrad() - y = Rand(2, 3, 4).RequireGrad() + x = Rand(2, 3, 2) + y = Rand(2, 3, 4) z = BMM(x, y, true, false) assert.Equal(t, []int{2, 2, 4}, z.shape) z.Backward() assert.Equal(t, []int{2, 3, 2}, x.grad.shape) // (2,2,3) * (2,4,3).T -> (2,2,4) - x = Rand(2, 2, 3).RequireGrad() - y = Rand(2, 4, 3).RequireGrad() + x = Rand(2, 2, 3) + y = Rand(2, 4, 3) z = BMM(x, y, false, true) assert.Equal(t, []int{2, 2, 4}, z.shape) z.Backward() assert.Equal(t, []int{2, 2, 3}, x.grad.shape) // (2,3,2).T * (2,43).T -> (2,2,4) - x = Rand(2, 3, 2).RequireGrad() - y = Rand(2, 4, 3).RequireGrad() + x = Rand(2, 3, 2) + y = Rand(2, 4, 3) z = BMM(x, y, true, true) assert.Equal(t, []int{2, 2, 4}, z.shape) z.Backward() @@ -453,7 +453,7 @@ func TestBMM(t *testing.T) { func TestBroadcast(t *testing.T) { // (2) -> (2,3) - x := NewVariable([]float32{1, 2}, 2) + x := NewTensor([]float32{1, 2}, 2) y := Broadcast(x, 3) assert.Equal(t, []float32{1, 1, 1, 2, 2, 2}, y.data) @@ -464,8 +464,8 @@ func TestBroadcast(t *testing.T) { func TestEmbedding(t *testing.T) { // (2,3) -> (2,3,2) - x := NewVariable([]float32{0, 1, 0, 3, 0, 5}, 2, 3) - w := NewVariable([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 2) + x := NewTensor([]float32{0, 1, 0, 3, 0, 5}, 2, 3) + w := NewTensor([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 2) y := Embedding(w, x) assert.Equal(t, []int{2, 3, 2}, y.shape) assert.Equal(t, []float32{0, 1, 2, 3, 0, 1, 6, 7, 0, 1, 10, 11}, y.data) @@ -476,8 +476,8 @@ func TestEmbedding(t *testing.T) { assert.Equal(t, []float32{3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, w.grad.data) // (2,3) -> (2,3,1,2) - x = NewVariable([]float32{0, 1, 0, 3, 0, 5}, 2, 3) - w = NewVariable([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 1, 2) + x = NewTensor([]float32{0, 1, 0, 3, 0, 5}, 2, 3) + w = NewTensor([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 6, 1, 2) y = Embedding(w, x) assert.Equal(t, []int{2, 3, 1, 2}, y.shape) assert.Equal(t, []float32{0, 1, 2, 3, 0, 1, 6, 7, 0, 1, 10, 11}, y.data) @@ -490,12 +490,12 @@ func TestEmbedding(t *testing.T) { func TestSigmoid(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{0, 1, 2, 3, 4, 5}, 2, 3) + x := NewTensor([]float32{0, 1, 2, 3, 4, 5}, 2, 3) y := Sigmoid(x) assert.InDeltaSlice(t, []float32{0.5, 0.7310585786300049, 0.8807970779778823, 0.9525741268224334, 0.9820137900379085, 0.9933071490757153}, y.data, 1e-6) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = Sigmoid(x) y.Backward() dx := numericalDiff(Sigmoid, x) @@ -504,12 +504,12 @@ func TestSigmoid(t *testing.T) { func TestReLu(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{-1, 0, 1, 2, 3, 4}, 2, 3) + x := NewTensor([]float32{-1, 0, 1, 2, 3, 4}, 2, 3) y := ReLu(x) assert.Equal(t, []float32{0, 0, 1, 2, 3, 4}, y.data) // Test gradient - x = Rand(2, 3).RequireGrad() + x = Rand(2, 3) y = ReLu(x) y.Backward() dx := numericalDiff(ReLu, x) @@ -518,7 +518,7 @@ func TestReLu(t *testing.T) { func TestSoftmax(t *testing.T) { // (1,3) -> (1,3) - x := NewVariable([]float32{3.0, 1.0, 0.2}, 1, 3) + x := NewTensor([]float32{3.0, 1.0, 0.2}, 1, 3) y := Softmax(x, 1) assert.Equal(t, []int{1, 3}, y.shape) assert.InDeltaSlice(t, []float32{0.8360188027814407, 0.11314284146556013, 0.05083835575299916}, y.data, 1e-6) @@ -531,7 +531,7 @@ func TestSoftmax(t *testing.T) { func TestFlatten(t *testing.T) { // (2,3) -> (6) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Flatten(x) assert.Equal(t, []float32{1, 2, 3, 4, 5, 6}, y.data) @@ -542,7 +542,7 @@ func TestFlatten(t *testing.T) { func TestReshape(t *testing.T) { // (2,3) -> (3,2) - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Reshape(x, 3, 2) assert.Equal(t, []float32{1, 2, 3, 4, 5, 6}, y.data) @@ -553,8 +553,8 @@ func TestReshape(t *testing.T) { func TestSoftmaxCrossEntropy(t *testing.T) { // (2,3) -> (2,3) - x := NewVariable([]float32{0.3, 2.9, 4.0, 0.2, 1.0, 3.0}, 3, 2) - y := NewVariable([]float32{1, 0, 1}, 3) + x := NewTensor([]float32{0.3, 2.9, 4.0, 0.2, 1.0, 3.0}, 3, 2) + y := NewTensor([]float32{1, 0, 1}, 3) z := SoftmaxCrossEntropy(x, y) assert.Empty(t, z.shape) assert.InDelta(t, float32(0.07356563982184072), z.data[0], 1e-4) @@ -567,7 +567,7 @@ func TestSoftmaxCrossEntropy(t *testing.T) { func TestReuseLeaf(t *testing.T) { // x + x - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) y := Add(x, x) assert.Equal(t, []float32{2, 4, 6, 8, 10, 12}, y.data) @@ -579,15 +579,15 @@ func TestReuseLeaf(t *testing.T) { func TestReuseNode(t *testing.T) { // x^2 + x^2 - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - temp := Pow(x, NewVariable([]float32{2})) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + temp := Pow(x, NewTensor([]float32{2})) y := Add(temp, temp) assert.Equal(t, []float32{2, 8, 18, 32, 50, 72}, y.data) // Test gradient y.Backward() dx := numericalDiff(func(x *Tensor) *Tensor { - temp := Pow(x, NewVariable([]float32{2})) + temp := Pow(x, NewTensor([]float32{2})) return Add(temp, temp) }, x) allClose(t, x.grad, dx) @@ -595,16 +595,16 @@ func TestReuseNode(t *testing.T) { func TestDependency(t *testing.T) { // x^2 + 2x^2 - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 2, 3) - temp := Pow(x, NewVariable([]float32{2})) - y := Add(temp, Mul(NewVariable([]float32{2}), temp)) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 2, 3) + temp := Pow(x, NewTensor([]float32{2})) + y := Add(temp, Mul(NewTensor([]float32{2}), temp)) assert.Equal(t, []float32{3, 12, 27, 48, 75, 108}, y.data) // Test gradient y.Backward() dx := numericalDiff(func(x *Tensor) *Tensor { - temp := Pow(x, NewVariable([]float32{2})) - return Add(temp, Mul(NewVariable([]float32{2}), temp)) + temp := Pow(x, NewTensor([]float32{2})) + return Add(temp, Mul(NewTensor([]float32{2}), temp)) }, x) allClose(t, x.grad, dx) } diff --git a/common/nn/optimizers.go b/common/nn/optimizers.go index 314980ade..9d13ed4dc 100644 --- a/common/nn/optimizers.go +++ b/common/nn/optimizers.go @@ -16,7 +16,6 @@ package nn import ( "github.com/chewxy/math32" - "github.com/google/uuid" ) type Optimizer interface { @@ -60,8 +59,8 @@ type Adam struct { beta1 float32 beta2 float32 eps float32 - ms map[uuid.UUID]*Tensor - vs map[uuid.UUID]*Tensor + ms map[*Tensor]*Tensor + vs map[*Tensor]*Tensor } func NewAdam(params []*Tensor, alpha float32) Optimizer { @@ -71,19 +70,19 @@ func NewAdam(params []*Tensor, alpha float32) Optimizer { beta1: 0.9, beta2: 0.999, eps: 1e-8, - ms: make(map[uuid.UUID]*Tensor), - vs: make(map[uuid.UUID]*Tensor), + ms: make(map[*Tensor]*Tensor), + vs: make(map[*Tensor]*Tensor), } } func (a *Adam) Step() { for _, p := range a.params { - if _, ok := a.ms[p.id]; !ok { - a.ms[p.id] = Zeros(p.shape...) - a.vs[p.id] = Zeros(p.shape...) + if _, ok := a.ms[p]; !ok { + a.ms[p] = Zeros(p.shape...) + a.vs[p] = Zeros(p.shape...) } - m, v := a.ms[p.id], a.vs[p.id] + m, v := a.ms[p], a.vs[p] grad := p.grad.data for i := range m.data { diff --git a/common/nn/tensor.go b/common/nn/tensor.go index 695236ea0..3e8e01a15 100644 --- a/common/nn/tensor.go +++ b/common/nn/tensor.go @@ -23,7 +23,6 @@ import ( "github.com/chewxy/math32" mapset "github.com/deckarep/golang-set/v2" - "github.com/google/uuid" "github.com/samber/lo" "github.com/zhenghaoz/gorse/base/floats" "golang.org/x/exp/slices" @@ -34,9 +33,6 @@ type Tensor struct { shape []int grad *Tensor op op - - requireGrad bool - id uuid.UUID // Only assigned if requireGrad is true } func NewTensor(data []float32, shape ...int) *Tensor { @@ -53,10 +49,6 @@ func NewTensor(data []float32, shape ...int) *Tensor { } } -func NewVariable(data []float32, shape ...int) *Tensor { - return NewTensor(data, shape...).RequireGrad() -} - func NewScalar(data float32) *Tensor { return &Tensor{ data: []float32{data}, @@ -158,12 +150,6 @@ func (t *Tensor) NoGrad() *Tensor { return t } -func (t *Tensor) RequireGrad() *Tensor { - t.requireGrad = true - t.id = uuid.New() - return t -} - func (t *Tensor) Shape() []int { return t.shape } diff --git a/common/nn/tensor_test.go b/common/nn/tensor_test.go index 309a23809..29a7490a5 100644 --- a/common/nn/tensor_test.go +++ b/common/nn/tensor_test.go @@ -34,7 +34,7 @@ func TestTensor_Slice(t *testing.T) { } func TestTensor_Max(t *testing.T) { - x := NewVariable([]float32{3, 2, 5, 6, 0, 0}, 6) + x := NewTensor([]float32{3, 2, 5, 6, 0, 0}, 6) y := x.max(0, false) assert.Len(t, y.shape, 0) assert.Equal(t, []float32{6}, y.data) @@ -42,7 +42,7 @@ func TestTensor_Max(t *testing.T) { assert.Panics(t, func() { x.max(-1, false) }) assert.Panics(t, func() { x.max(2, false) }) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2) y = x.max(1, false) assert.Equal(t, []int{3, 2}, y.shape) assert.Equal(t, []float32{3, 4, 7, 8, 11, 12}, y.data) @@ -52,7 +52,7 @@ func TestTensor_Max(t *testing.T) { } func TestTensor_Sum(t *testing.T) { - x := NewVariable([]float32{1, 2, 3, 4, 5, 6}, 6) + x := NewTensor([]float32{1, 2, 3, 4, 5, 6}, 6) y := x.sum(0, false) assert.Len(t, y.shape, 0) assert.Equal(t, []float32{21}, y.data) @@ -60,7 +60,7 @@ func TestTensor_Sum(t *testing.T) { assert.Panics(t, func() { x.sum(-1, false) }) assert.Panics(t, func() { x.sum(2, false) }) - x = NewVariable([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2) + x = NewTensor([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 2, 2) y = x.sum(1, false) assert.Equal(t, []int{3, 2}, y.shape) assert.Equal(t, []float32{4, 6, 12, 14, 20, 22}, y.data) From 85d91539d910ff0668c2547293eabbfddf9dc3e8 Mon Sep 17 00:00:00 2001 From: Zhenghao Zhang Date: Sun, 5 Jan 2025 21:59:50 +0800 Subject: [PATCH 2/3] nn: Fix Adam optimizer --- common/nn/layers.go | 3 ++- common/nn/nn_test.go | 39 ++++++++++++++++++++++++--------------- common/nn/optimizers.go | 18 ++++++++++++------ common/nn/tensor.go | 15 +++++++++++++++ 4 files changed, 53 insertions(+), 22 deletions(-) diff --git a/common/nn/layers.go b/common/nn/layers.go index 3774cb8c4..02736d7b8 100644 --- a/common/nn/layers.go +++ b/common/nn/layers.go @@ -29,8 +29,9 @@ type linearLayer struct { } func NewLinear(in, out int) Layer { + bound := 1.0 / math32.Sqrt(float32(in)) return &linearLayer{ - w: Normal(0, 1.0/math32.Sqrt(float32(in)), in, out), + w: Uniform(-bound, bound, in, out), b: Zeros(out), } } diff --git a/common/nn/nn_test.go b/common/nn/nn_test.go index bfcc751e6..60098506f 100644 --- a/common/nn/nn_test.go +++ b/common/nn/nn_test.go @@ -23,6 +23,7 @@ import ( "strconv" "strings" "testing" + "time" "github.com/chewxy/math32" "github.com/samber/lo" @@ -207,6 +208,17 @@ func openMNISTFile(path string) (*Tensor, *Tensor, error) { return NewTensor(images, len(labels), 784), NewTensor(labels, len(labels)), nil } +func accuracy(prediction, target *Tensor) float32 { + var precision float32 + for i, gt := range target.data { + if prediction.Slice(i, i+1).argmax()[1] == int(gt) { + precision += 1 + } + } + precision /= float32(len(target.data)) + return precision +} + func TestMNIST(t *testing.T) { train, test, err := mnist() assert.NoError(t, err) @@ -218,13 +230,14 @@ func TestMNIST(t *testing.T) { ) optimizer := NewAdam(model.Parameters(), 0.001) - var ( - sumLoss float32 + const ( batchSize = 1000 + numEpoch = 10 ) - for i := 0; i < 3; i++ { - sumLoss = 0 - bar := progressbar.Default(int64(train.A.shape[0]), fmt.Sprintf("Epoch %v/%v", i+1, 3)) + for i := 0; i < numEpoch; i++ { + startTime := time.Now() + sumLoss, sumAcc := float32(0), float32(0) + bar := progressbar.Default(int64(train.A.shape[0]), fmt.Sprintf("Epoch %v/%v", i+1, numEpoch)) for j := 0; j < train.A.shape[0]; j += batchSize { xBatch := train.A.Slice(j, j+batchSize) yBatch := train.B.Slice(j, j+batchSize) @@ -237,20 +250,16 @@ func TestMNIST(t *testing.T) { optimizer.Step() sumLoss += loss.data[0] + sumAcc += accuracy(yPred, yBatch) bar.Add(batchSize) } sumLoss /= float32(train.A.shape[0] / batchSize) + sumAcc /= float32(train.A.shape[0] / batchSize) bar.Finish() + fmt.Println("Duration:", time.Since(startTime), "Loss:", sumLoss, "Accuracy:", sumAcc) } - assert.Less(t, sumLoss, float32(0.4)) - testPred := model.Forward(test.A) - var precision float32 - for i, gt := range test.B.data { - if testPred.Slice(i, i+1).argmax()[1] == int(gt) { - precision += 1 - } - } - precision /= float32(len(test.B.data)) - assert.Greater(t, float64(precision), 0.92) + testAcc := accuracy(model.Forward(test.A), test.B) + fmt.Println("Test Accuracy:", testAcc) + assert.Greater(t, float64(testAcc), 0.92) } diff --git a/common/nn/optimizers.go b/common/nn/optimizers.go index 9d13ed4dc..4ef2bca6c 100644 --- a/common/nn/optimizers.go +++ b/common/nn/optimizers.go @@ -61,6 +61,7 @@ type Adam struct { eps float32 ms map[*Tensor]*Tensor vs map[*Tensor]*Tensor + t float32 } func NewAdam(params []*Tensor, alpha float32) Optimizer { @@ -76,6 +77,7 @@ func NewAdam(params []*Tensor, alpha float32) Optimizer { } func (a *Adam) Step() { + a.t++ for _, p := range a.params { if _, ok := a.ms[p]; !ok { a.ms[p] = Zeros(p.shape...) @@ -86,12 +88,16 @@ func (a *Adam) Step() { grad := p.grad.data for i := range m.data { - // m += (1 - beta1) * (grad - m) - m.data[i] += (1 - a.beta1) * (grad[i] - m.data[i]) - // v += (1 - beta2) * (grad * grad - v) - v.data[i] += (1 - a.beta2) * (grad[i]*grad[i] - v.data[i]) - // param.data -= self.lr * m / (xp.sqrt(v) + eps) - p.data[i] -= a.alpha * m.data[i] / (math32.Sqrt(v.data[i]) + a.eps) + // m_t = beta1 * m + (1 - beta1) * grad + m.data[i] = a.beta1*m.data[i] + (1-a.beta1)*grad[i] + // v_t = beta2 * v + (1 - beta2) * grad^2 + v.data[i] = a.beta2*v.data[i] + (1-a.beta2)*grad[i]*grad[i] + // \hat{m} = m / (1 - beta1^t) + hatM := m.data[i] / (1 - math32.Pow(a.beta1, a.t)) + // \hat{v} = v / (1 - beta2^t) + hatV := v.data[i] / (1 - math32.Pow(a.beta2, a.t)) + // p_t = p - alpha * \hat{m} / (\sqrt{\hat{v}} + eps) + p.data[i] -= a.alpha * hatM / (math32.Sqrt(hatV) + a.eps) } } } diff --git a/common/nn/tensor.go b/common/nn/tensor.go index 3e8e01a15..1373ede43 100644 --- a/common/nn/tensor.go +++ b/common/nn/tensor.go @@ -87,6 +87,21 @@ func Rand(shape ...int) *Tensor { } } +func Uniform(low, high float32, shape ...int) *Tensor { + n := 1 + for _, s := range shape { + n *= s + } + data := make([]float32, n) + for i := range data { + data[i] = rand.Float32()*(high-low) + low + } + return &Tensor{ + data: data, + shape: shape, + } +} + func Normal(mean, std float32, shape ...int) *Tensor { n := 1 for _, s := range shape { From ee39fd890989a44c02c581730ad37817522a3d9c Mon Sep 17 00:00:00 2001 From: Zhenghao Zhang Date: Sun, 5 Jan 2025 23:29:46 +0800 Subject: [PATCH 3/3] nn: Fix Adam optimizer --- common/nn/nn_test.go | 12 ++++++++++-- common/nn/optimizers.go | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/common/nn/nn_test.go b/common/nn/nn_test.go index 60098506f..08f1fc05a 100644 --- a/common/nn/nn_test.go +++ b/common/nn/nn_test.go @@ -26,6 +26,7 @@ import ( "time" "github.com/chewxy/math32" + "github.com/klauspost/cpuid/v2" "github.com/samber/lo" "github.com/schollz/progressbar/v3" "github.com/stretchr/testify/assert" @@ -220,6 +221,13 @@ func accuracy(prediction, target *Tensor) float32 { } func TestMNIST(t *testing.T) { + if cpuid.CPU.VendorString != "Apple" && !cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ) { + // Since the test takes a long time, we run the test only in development environment. + // 1. Mac with Apple Silicon. + // 2. x86 CPU with AVX512 support. + t.Skip("Skip test on non-development environment.") + } + train, test, err := mnist() assert.NoError(t, err) @@ -232,7 +240,7 @@ func TestMNIST(t *testing.T) { const ( batchSize = 1000 - numEpoch = 10 + numEpoch = 5 ) for i := 0; i < numEpoch; i++ { startTime := time.Now() @@ -261,5 +269,5 @@ func TestMNIST(t *testing.T) { testAcc := accuracy(model.Forward(test.A), test.B) fmt.Println("Test Accuracy:", testAcc) - assert.Greater(t, float64(testAcc), 0.92) + assert.Greater(t, float64(testAcc), 0.96) } diff --git a/common/nn/optimizers.go b/common/nn/optimizers.go index 4ef2bca6c..745bfd087 100644 --- a/common/nn/optimizers.go +++ b/common/nn/optimizers.go @@ -87,17 +87,17 @@ func (a *Adam) Step() { m, v := a.ms[p], a.vs[p] grad := p.grad.data + fix1 := 1 - math32.Pow(a.beta1, a.t) + fix2 := 1 - math32.Pow(a.beta2, a.t) + lr := a.alpha * math32.Sqrt(fix2) / fix1 + for i := range m.data { - // m_t = beta1 * m + (1 - beta1) * grad - m.data[i] = a.beta1*m.data[i] + (1-a.beta1)*grad[i] - // v_t = beta2 * v + (1 - beta2) * grad^2 - v.data[i] = a.beta2*v.data[i] + (1-a.beta2)*grad[i]*grad[i] - // \hat{m} = m / (1 - beta1^t) - hatM := m.data[i] / (1 - math32.Pow(a.beta1, a.t)) - // \hat{v} = v / (1 - beta2^t) - hatV := v.data[i] / (1 - math32.Pow(a.beta2, a.t)) - // p_t = p - alpha * \hat{m} / (\sqrt{\hat{v}} + eps) - p.data[i] -= a.alpha * hatM / (math32.Sqrt(hatV) + a.eps) + // m += (1 - beta1) * (grad - m) + m.data[i] += (1 - a.beta1) * (grad[i] - m.data[i]) + // v += (1 - beta2) * (grad * grad - v) + v.data[i] += (1 - a.beta2) * (grad[i]*grad[i] - v.data[i]) + // param.data -= self.lr * m / (xp.sqrt(v) + eps) + p.data[i] -= lr * m.data[i] / (math32.Sqrt(v.data[i]) + a.eps) } } }