From b5bfe2d129121d42145b74c85b8e20317c121953 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 4 Oct 2023 10:06:14 -0700 Subject: [PATCH 1/2] Vectorize TensorPrimitives.Exp --- .../Numerics/Tensors/TensorPrimitives.cs | 16 +- .../Tensors/TensorPrimitives.netcore.cs | 280 ++++++++++++++++++ .../Tensors/TensorPrimitives.netstandard.cs | 13 + 3 files changed, 295 insertions(+), 14 deletions(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs index b1b96e55bb44f..acc311df6e4cb 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs @@ -322,20 +322,8 @@ public static float Dot(ReadOnlySpan x, ReadOnlySpan y) /// operating systems or architectures. /// /// - public static void Exp(ReadOnlySpan x, Span destination) - { - if (x.Length > destination.Length) - { - ThrowHelper.ThrowArgument_DestinationTooShort(); - } - - ValidateInputOutputSpanNonOverlapping(x, destination); - - for (int i = 0; i < x.Length; i++) - { - destination[i] = MathF.Exp(x[i]); - } - } + public static void Exp(ReadOnlySpan x, Span destination) => + InvokeSpanIntoSpan(x, destination); /// Searches for the index of the largest single-precision floating-point number in the specified tensor. /// The tensor, represented as a span. diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs index 7e13b54148e4e..aa306db678d77 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs @@ -2579,6 +2579,286 @@ public static Vector512 Invoke(Vector512 x, Vector512 y) #endif } + private readonly struct ExpOperator : IUnaryOperator + { + // This code is based on `vrs4_expf` from amd/aocl-libm-ose + // Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. + // + // Licensed under the BSD 3-Clause "New" or "Revised" License + // See THIRD-PARTY-NOTICES.TXT for the full license text + + // Implementation Notes: + // 1. Argument Reduction: + // e^x = 2^(x/ln2) --- (1) + // + // Let x/ln(2) = z --- (2) + // + // Let z = n + r , where n is an integer --- (3) + // |r| <= 1/2 + // + // From (1), (2) and (3), + // e^x = 2^z + // = 2^(N+r) + // = (2^N)*(2^r) --- (4) + // + // 2. Polynomial Evaluation + // From (4), + // r = z - N + // 2^r = C1 + C2*r + C3*r^2 + C4*r^3 + C5 *r^4 + C6*r^5 + // + // 4. Reconstruction + // Thus, + // e^x = (2^N) * (2^r) + + private const uint V_ARG_MAX = 0x42AE0000; + private const uint V_MASK = 0x7FFFFFFF; + + private const float V_EXPF_MIN = -103.97208f; + private const float V_EXPF_MAX = 88.72284f; + + private const double V_EXPF_HUGE = 6755399441055744; + private const double V_TBL_LN2 = 1.4426950408889634; + + private const double C1 = 1.0000000754895704; + private const double C2 = 0.6931472254087585; + private const double C3 = 0.2402210737432219; + private const double C4 = 0.05550297297702539; + private const double C5 = 0.009676036358193323; + private const double C6 = 0.001341000536524434; + + public static float Invoke(float x) => MathF.Exp(x); + + public static Vector128 Invoke(Vector128 x) + { + // Convert x to double precision + (Vector128 xl, Vector128 xu) = Vector128.Widen(x); + + // x * (64.0 / ln(2)) + Vector128 v_tbl_ln2 = Vector128.Create(V_TBL_LN2); + + Vector128 zl = xl * v_tbl_ln2; + Vector128 zu = xu * v_tbl_ln2; + + Vector128 v_expf_huge = Vector128.Create(V_EXPF_HUGE); + + Vector128 dnl = zl + v_expf_huge; + Vector128 dnu = zu + v_expf_huge; + + // n = int (z) + Vector128 nl = dnl.AsUInt64(); + Vector128 nu = dnu.AsUInt64(); + + // dn = double(n) + dnl -= v_expf_huge; + dnu -= v_expf_huge; + + // r = z - dn + Vector128 c1 = Vector128.Create(C1); + Vector128 c2 = Vector128.Create(C2); + Vector128 c3 = Vector128.Create(C3); + Vector128 c4 = Vector128.Create(C4); + Vector128 c5 = Vector128.Create(C5); + Vector128 c6 = Vector128.Create(C6); + + Vector128 rl = zl - dnl; + + Vector128 rl2 = rl * rl; + Vector128 rl4 = rl2 * rl2; + + Vector128 polyl = (c4 * rl + c3) * rl2 + + ((c6 * rl + c5) * rl4 + + (c2 * rl + c1)); + + + Vector128 ru = zu - dnu; + + Vector128 ru2 = ru * ru; + Vector128 ru4 = ru2 * ru2; + + Vector128 polyu = (c4 * ru + c3) * ru2 + + ((c6 * ru + c5) * ru4 + + (c2 * ru + c1)); + + // result = (float)[poly + (n << 52)] + Vector128 ret = Vector128.Narrow( + (polyl.AsUInt64() + Vector128.ShiftLeft(nl, 52)).AsDouble(), + (polyu.AsUInt64() + Vector128.ShiftLeft(nu, 52)).AsDouble() + ); + + // Check if -103 < |x| < 88 + if (Vector128.GreaterThanAny(x.AsUInt32() & Vector128.Create(V_MASK), Vector128.Create(V_ARG_MAX))) + { + // (x > V_EXPF_MAX) ? float.PositiveInfinity : x + Vector128 infinityMask = Vector128.GreaterThan(x, Vector128.Create(V_EXPF_MAX)); + + ret = Vector128.ConditionalSelect( + infinityMask, + Vector128.Create(float.PositiveInfinity), + ret + ); + + // (x < V_EXPF_MIN) ? 0 : x + ret = Vector128.AndNot(ret, Vector128.LessThan(x, Vector128.Create(V_EXPF_MIN))); + } + + return ret; + } + + public static Vector256 Invoke(Vector256 x) + { + // Convert x to double precision + (Vector256 xl, Vector256 xu) = Vector256.Widen(x); + + // x * (64.0 / ln(2)) + Vector256 v_tbl_ln2 = Vector256.Create(V_TBL_LN2); + + Vector256 zl = xl * v_tbl_ln2; + Vector256 zu = xu * v_tbl_ln2; + + Vector256 v_expf_huge = Vector256.Create(V_EXPF_HUGE); + + Vector256 dnl = zl + v_expf_huge; + Vector256 dnu = zu + v_expf_huge; + + // n = int (z) + Vector256 nl = dnl.AsUInt64(); + Vector256 nu = dnu.AsUInt64(); + + // dn = double(n) + dnl -= v_expf_huge; + dnu -= v_expf_huge; + + // r = z - dn + Vector256 c1 = Vector256.Create(C1); + Vector256 c2 = Vector256.Create(C2); + Vector256 c3 = Vector256.Create(C3); + Vector256 c4 = Vector256.Create(C4); + Vector256 c5 = Vector256.Create(C5); + Vector256 c6 = Vector256.Create(C6); + + Vector256 rl = zl - dnl; + + Vector256 rl2 = rl * rl; + Vector256 rl4 = rl2 * rl2; + + Vector256 polyl = (c4 * rl + c3) * rl2 + + ((c6 * rl + c5) * rl4 + + (c2 * rl + c1)); + + + Vector256 ru = zu - dnu; + + Vector256 ru2 = ru * ru; + Vector256 ru4 = ru2 * ru2; + + Vector256 polyu = (c4 * ru + c3) * ru2 + + ((c6 * ru + c5) * ru4 + + (c2 * ru + c1)); + + // result = (float)[poly + (n << 52)] + Vector256 ret = Vector256.Narrow( + (polyl.AsUInt64() + Vector256.ShiftLeft(nl, 52)).AsDouble(), + (polyu.AsUInt64() + Vector256.ShiftLeft(nu, 52)).AsDouble() + ); + + // Check if -103 < |x| < 88 + if (Vector256.GreaterThanAny(x.AsUInt32() & Vector256.Create(V_MASK), Vector256.Create(V_ARG_MAX))) + { + // (x > V_EXPF_MAX) ? float.PositiveInfinity : x + Vector256 infinityMask = Vector256.GreaterThan(x, Vector256.Create(V_EXPF_MAX)); + + ret = Vector256.ConditionalSelect( + infinityMask, + Vector256.Create(float.PositiveInfinity), + ret + ); + + // (x < V_EXPF_MIN) ? 0 : x + ret = Vector256.AndNot(ret, Vector256.LessThan(x, Vector256.Create(V_EXPF_MIN))); + } + + return ret; + } + +#if NET8_0_OR_GREATER + public static Vector512 Invoke(Vector512 x) + { + // Convert x to double precision + (Vector512 xl, Vector512 xu) = Vector512.Widen(x); + + // x * (64.0 / ln(2)) + Vector512 v_tbl_ln2 = Vector512.Create(V_TBL_LN2); + + Vector512 zl = xl * v_tbl_ln2; + Vector512 zu = xu * v_tbl_ln2; + + Vector512 v_expf_huge = Vector512.Create(V_EXPF_HUGE); + + Vector512 dnl = zl + v_expf_huge; + Vector512 dnu = zu + v_expf_huge; + + // n = int (z) + Vector512 nl = dnl.AsUInt64(); + Vector512 nu = dnu.AsUInt64(); + + // dn = double(n) + dnl -= v_expf_huge; + dnu -= v_expf_huge; + + // r = z - dn + Vector512 c1 = Vector512.Create(C1); + Vector512 c2 = Vector512.Create(C2); + Vector512 c3 = Vector512.Create(C3); + Vector512 c4 = Vector512.Create(C4); + Vector512 c5 = Vector512.Create(C5); + Vector512 c6 = Vector512.Create(C6); + + Vector512 rl = zl - dnl; + + Vector512 rl2 = rl * rl; + Vector512 rl4 = rl2 * rl2; + + Vector512 polyl = (c4 * rl + c3) * rl2 + + ((c6 * rl + c5) * rl4 + + (c2 * rl + c1)); + + + Vector512 ru = zu - dnu; + + Vector512 ru2 = ru * ru; + Vector512 ru4 = ru2 * ru2; + + Vector512 polyu = (c4 * ru + c3) * ru2 + + ((c6 * ru + c5) * ru4 + + (c2 * ru + c1)); + + // result = (float)[poly + (n << 52)] + Vector512 ret = Vector512.Narrow( + (polyl.AsUInt64() + Vector512.ShiftLeft(nl, 52)).AsDouble(), + (polyu.AsUInt64() + Vector512.ShiftLeft(nu, 52)).AsDouble() + ); + + // Check if -103 < |x| < 88 + if (Vector512.GreaterThanAny(x.AsUInt32() & Vector512.Create(V_MASK), Vector512.Create(V_ARG_MAX))) + { + // (x > V_EXPF_MAX) ? float.PositiveInfinity : x + Vector512 infinityMask = Vector512.GreaterThan(x, Vector512.Create(V_EXPF_MAX)); + + ret = Vector512.ConditionalSelect( + infinityMask, + Vector512.Create(float.PositiveInfinity), + ret + ); + + // (x < V_EXPF_MIN) ? 0 : x + ret = Vector512.AndNot(ret, Vector512.LessThan(x, Vector512.Create(V_EXPF_MIN))); + } + + return ret; + } +#endif + } + private readonly struct LogOperator : IUnaryOperator { // This code is based on `vrs4_logf` from amd/aocl-libm-ose diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs index b9c1e128aaede..eb9887daf4f84 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs @@ -923,6 +923,19 @@ public Vector Invoke(Vector x, Vector y) public Vector Invoke(Vector x) => Vector.Abs(x); } + private readonly struct ExpOperator : IUnaryOperator + { + public bool CanVectorize => false; + + public float Invoke(float x) => MathF.Exp(x); + + public Vector Invoke(Vector x) + { + // Vectorizing requires shift right support, which is .NET 7 or later + throw new NotImplementedException(); + } + } + private readonly struct LogOperator : IUnaryOperator { public bool CanVectorize => false; From ce489e7da1951548ed290a4a5eb3104572e717f9 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 4 Oct 2023 10:48:06 -0700 Subject: [PATCH 2/2] Update src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs --- .../src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs index eb9887daf4f84..ae72988bbe00e 100644 --- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs +++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs @@ -931,7 +931,7 @@ public Vector Invoke(Vector x, Vector y) public Vector Invoke(Vector x) { - // Vectorizing requires shift right support, which is .NET 7 or later + // Vectorizing requires shift left support, which is .NET 7 or later throw new NotImplementedException(); } }