From a421a90e0ad2432ee2621c41ace63a2702b13452 Mon Sep 17 00:00:00 2001 From: Damian Schneider Date: Mon, 20 Jan 2025 05:51:04 +0100 Subject: [PATCH] replacement for fastled sqrt16() (#4426) * added bitwise operation based sqrt16 - replacement for fastled, it is about 10% slower for numbers smaller 128 but faster for larger numbers. speed difference is irrelevant to WLED but it saves some flash. * updated to 32bit, improved for typical WLED use - making it 32bits allows for larger numbers - added another initial condition check for medium sized numbers - increased the "small number" optimization to larger numbers: the function is currently only used to calculate sqrt(x^2+y^2) which even for small segments is larger than the initially used 64, so optimizing for 1024 makes more sense, although the value is arbitrarily chosen --- wled00/FX.cpp | 6 +++--- wled00/FX_fcn.cpp | 4 ++-- wled00/fcn_declare.h | 1 + wled00/wled_math.cpp | 24 ++++++++++++++++++++++++ 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/wled00/FX.cpp b/wled00/FX.cpp index 2655d7daab..9fffe4d094 100644 --- a/wled00/FX.cpp +++ b/wled00/FX.cpp @@ -5446,15 +5446,15 @@ uint16_t mode_2Dmetaballs(void) { // Metaballs by Stefan Petrick. Cannot have // and add them together with weightening unsigned dx = abs(x - x1); unsigned dy = abs(y - y1); - unsigned dist = 2 * sqrt16((dx * dx) + (dy * dy)); + unsigned dist = 2 * sqrt32_bw((dx * dx) + (dy * dy)); dx = abs(x - x2); dy = abs(y - y2); - dist += sqrt16((dx * dx) + (dy * dy)); + dist += sqrt32_bw((dx * dx) + (dy * dy)); dx = abs(x - x3); dy = abs(y - y3); - dist += sqrt16((dx * dx) + (dy * dy)); + dist += sqrt32_bw((dx * dx) + (dy * dy)); // inverse result int color = dist ? 
// Integer square root using only shifts, additions and comparisons.
// Returns the largest r with r*r <= x (the root rounded down); exact for perfect squares.
// Works digit by digit in base 4: each pass decides one bit of the result, from the top down.
uint32_t sqrt32_bw(uint32_t x) {
  // Pick the starting probe from the magnitude of x so small inputs need fewer passes.
  uint32_t probe = 1u << 30;            // 4^15, the largest power of 4 that fits in 32 bits
  if (x < (1u << 10)) {
    probe = 1u << 10;                   // x is below 32^2
  } else if (x < (1u << 20)) {
    probe = 1u << 20;                   // x is below 1024^2
  }

  // Step down to the largest power of 4 not exceeding x; for x == 0 this reaches 0
  // and the main loop below is skipped entirely.
  while (probe > x) {
    probe >>= 2;
  }

  uint32_t remainder = x;               // what is left of x after subtracting accepted terms
  uint32_t root = 0;                    // partial result, realigned by one bit every pass
  for (; probe != 0; probe >>= 2) {
    const uint32_t trial = root + probe;
    root >>= 1;
    if (remainder >= trial) {           // the current bit belongs to the root
      remainder -= trial;
      root += probe;
    }
  }
  return root;
}