From bc0131d69b8859f76096aec5ea558eaa93cf0141 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 1 Aug 2024 20:41:01 -0700 Subject: [PATCH] feat: offload matrix multiply routines to Octavian.jl --- Project.toml | 4 +++- src/LuxLib.jl | 1 + src/impl/matmul.jl | 44 ++++++++++++++++++++++---------------------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/Project.toml b/Project.toml index 6979bfcb..bf474dfe 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LuxLib" uuid = "82251201-b29d-42c6-8e01-566dec8acb11" authors = ["Avik Pal and contributors"] -version = "0.3.38" +version = "0.3.39" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" @@ -17,6 +17,7 @@ LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623" MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa" @@ -63,6 +64,7 @@ LuxTestUtils = "1.1" MLDataDevices = "1.0.0" Markdown = "1.10" NNlib = "0.9.21" +Octavian = "0.3.28" Pkg = "1.10" Preferences = "1.4" Random = "1.10" diff --git a/src/LuxLib.jl b/src/LuxLib.jl index 1ff5d310..67796493 100644 --- a/src/LuxLib.jl +++ b/src/LuxLib.jl @@ -14,6 +14,7 @@ using Markdown: @doc_str using MLDataDevices: get_device_type, AMDGPUDevice, CUDADevice, CPUDevice, AbstractGPUDevice, AbstractDevice using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, σ, ∇conv_data, ∇conv_filter +using Octavian: Octavian using Random: Random, AbstractRNG, rand! using Reexport: @reexport using Setfield: @set! diff --git a/src/impl/matmul.jl b/src/impl/matmul.jl index 0e51320c..de40000f 100644 --- a/src/impl/matmul.jl +++ b/src/impl/matmul.jl @@ -32,17 +32,21 @@ function matmuladd!(C::AbstractMatrix, ::AbstractInternalArrayOpMode, end function matmuladd!(C::AbstractMatrix, ::LoopedArrayOp, A::AbstractMatrix, B::AbstractMatrix, bias::AbstractVector) - if unrolled_any(≤(256), (size(C, 1), size(A, 2), size(B, 2))) && + dims = (size(C, 1), size(A, 2), size(B, 2)) + if unrolled_any(≤(2048), dims) && + unrolled_all(≤(10_000), dims) && LoopVectorization.check_args(C, A, B) - __matmuladd_loopvec!(C, A, B, bias) + __matmuladd_octavian!(C, A, B, bias) return end __matmuladd_generic!(C, A, B, bias) return end -function __matmuladd_loopvec!( +function __matmuladd_octavian!( C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, bias::AbstractVector) + # NOTE: Octavian doesn't do size checks. + # See https://github.com/JuliaLinearAlgebra/Octavian.jl/issues/109 if size(A, 2) != size(B, 1) throw(DimensionMismatch(lazy"A has shape ($(size(A, 1)), $(size(A, 2))) but B has shape ($(size(B, 1)), $(size(B, 2)))")) end @@ -51,13 +55,11 @@ function __matmuladd_loopvec!( throw(DimensionMismatch(lazy"bias has length $(length(bias)) but A has shape ($(size(A, 1)), $(size(A, 2)))")) end - @tturbo for n in indices((C, B), 2), m in indices((C, A), 1) - Cmn = zero(eltype(C)) - for k in indices((A, B), (2, 1)) - Cmn += A[m, k] * B[k, n] - end - C[m, n] = Cmn + bias[m] + @tturbo for n in indices(C, 2), m in indices(C, 1) + C[m, n] = bias[m] end + Octavian.matmul!(C, A, B, true, true) + return end function __matmuladd_generic!( @@ -91,27 +93,25 @@ function matmul!(C::AbstractMatrix, ::AbstractInternalArrayOpMode, return end function matmul!(C::AbstractMatrix, ::LoopedArrayOp, A::AbstractMatrix, B::AbstractMatrix) - if unrolled_any(≤(256), (size(C, 1), size(A, 2), size(B, 2))) && + dims = (size(C, 1), size(A, 2), size(B, 2)) + if unrolled_any(≤(2048), dims) && + unrolled_all(≤(10_000), dims) && LoopVectorization.check_args(C, A, B) - __matmul_loopvec!(C, A, B) + __matmul_octavian!(C, A, B) return end __matmul_generic!(C, A, B) return end -function __matmul_loopvec!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix) +function __matmul_octavian!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix) + # NOTE: Octavian doesn't do size checks. + # See https://github.com/JuliaLinearAlgebra/Octavian.jl/issues/109 if size(A, 2) != size(B, 1) throw(DimensionMismatch(lazy"A has shape ($(size(A, 1)), $(size(A, 2))) but B has shape ($(size(B, 1)), $(size(B, 2)))")) end - - @tturbo for n in indices((C, B), 2), m in indices((C, A), 1) - Cmn = zero(eltype(C)) - for k in indices((A, B), (2, 1)) - Cmn += A[m, k] * B[k, n] - end - C[m, n] = Cmn - end + Octavian.matmul!(C, A, B) + return end function __matmul_generic!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix) @@ -151,6 +151,6 @@ function CRC.rrule(::typeof(matmuladd), opmode::LoopedArrayOp, end # EnzymeRules -@enzyme_reverse_alternative __matmul_loopvec! __matmul_generic! +@enzyme_reverse_alternative __matmul_octavian! __matmul_generic! -@enzyme_reverse_alternative __matmuladd_loopvec! __matmuladd_generic! +@enzyme_reverse_alternative __matmuladd_octavian! __matmuladd_generic!