From f5eb7e6c441d856e1496c12fdb8ce2a40519997f Mon Sep 17 00:00:00 2001
From: peterminea
Date: Tue, 14 Nov 2023 12:47:53 +0200
Subject: [PATCH 1/2] dlimbs_algs_multi - attempted definition for when LIMBS is a multiple of TPI (greater than TPI)

---
 .vscode/c_cpp_properties.json        |  16 ++
 include/cgbn/core/dispatch_dlimbs.cu | 210 ++++++++++++++++++++++++++-
 2 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 .vscode/c_cpp_properties.json

diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000..b1f27e8
--- /dev/null
+++ b/.vscode/c_cpp_properties.json
@@ -0,0 +1,16 @@
+{
+    "configurations": [
+        {
+            "name": "Win32",
+            "includePath": [
+                "${workspaceFolder}/**"
+            ],
+            "defines": [
+                "_DEBUG",
+                "UNICODE",
+                "_UNICODE"
+            ]
+        }
+    ],
+    "version": 4
+}
\ No newline at end of file
diff --git a/include/cgbn/core/dispatch_dlimbs.cu b/include/cgbn/core/dispatch_dlimbs.cu
index a075183..d796ff2 100644
--- a/include/cgbn/core/dispatch_dlimbs.cu
+++ b/include/cgbn/core/dispatch_dlimbs.cu
@@ -469,4 +469,212 @@ class dispatch_dlimbs_t<core, dlimbs_algs_full> {
   }
 };
 
-} /* namespace cgbn */
\ No newline at end of file
+template<class core>
+class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
+  public:
+  static const uint32_t TPI=core::TPI;
+  static const uint32_t LIMBS=core::LIMBS;
+  static const uint32_t DLIMBS=core::DLIMBS;
+  static const uint32_t LIMB_OFFSET=DLIMBS*TPI-LIMBS;
+
+  // These algorithms are used when LIMBS >= TPI. Almost the same as the half size ones, few tweaks here and there.
+
+  __device__ __forceinline__ static void dlimbs_approximate(uint32_t approx[DLIMBS], const uint32_t denom[DLIMBS]) {
+    uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
+    uint32_t x, d0, d1, x0, x1, x2, est, a, h, l;
+    int32_t  c, top;
+
+    // computes (beta^2 - 1) / denom - beta, where beta=1<<32*LIMBS
+
+    x=0xFFFFFFFF-denom[0];
+
+    d1=__shfl_sync(sync, denom[0], TPI-1, TPI);
+    d0=__shfl_sync(sync, denom[0], TPI-2, TPI);
+
+    approx[0]=0;
+    a=uapprox(d1);
+
+    #pragma nounroll
+    for(int32_t thread=LIMBS-1;thread>=0;thread--) {
+      x0=__shfl_sync(sync, x, TPI-3, TPI);
+      x1=__shfl_sync(sync, x, TPI-2, TPI);
+      x2=__shfl_sync(sync, x, TPI-1, TPI);
+      est=udiv(x0, x1, x2, d0, d1, a);
+
+      l=madlo_cc(est, denom[0], 0);
+      h=madhic(est, denom[0], 0);
+
+      x=sub_cc(x, h);
+      c=subc(0, 0);    // thread TPI-1 is zero
+
+      top=__shfl_sync(sync, x, TPI-1, TPI);
+      x=__shfl_up_sync(sync, x, 1, TPI);
+      c=__shfl_sync(sync, c, threadIdx.x-1, TPI);
+      x=(group_thread==0) ? 0xFFFFFFFF : x;
+
+      x=sub_cc(x, l);
+      c=subc(c, 0);
+
+      if(top+core::resolve_sub(c, x)<0) {
+        // means a correction is required, should be very rare
+        x=add_cc(x, denom[0]);
+        c=addc(0, 0);
+        core::fast_propagate_add(c, x);
+        est--;
+      }
+      approx[0]=(group_thread==thread+TPI-LIMBS) ? est : approx[0];
+    }
+  }
+
+  __device__ __forceinline__ static uint32_t dlimbs_sqrt_rem_wide(uint32_t s[DLIMBS], uint32_t r[DLIMBS], const uint32_t lo[DLIMBS], const uint32_t hi[DLIMBS]) {
+    uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
+    uint32_t x, x0, x1, t0, t1, divisor, approx, p, q, c, low;
+
+    // computes s=sqrt(x), r=x-s^2, where x=(hi<<32*LIMBS) + lo
+
+    low=lo[0];
+    x=hi[0];
+    if(TPI!=LIMBS) {
+      low=__shfl_sync(sync, low, threadIdx.x-TPI+LIMBS, TPI);
+      x=((int32_t)group_thread>=(int32_t)(TPI-LIMBS)) ? x : low;    // use casts to silence warning
+    }
+    x0=__shfl_sync(sync, x, TPI-2, TPI);
+    x1=__shfl_sync(sync, x, TPI-1, TPI);
+
+    divisor=usqrt(x0, x1);
+    approx=uapprox(divisor);
+
+    t0=madlo_cc(divisor, divisor, 0);
+    t1=madhic(divisor, divisor, 0);
+    x0=sub_cc(x0, t0);
+    x1=subc(x1, t1);
+
+    x=(group_thread==TPI-1) ? low : x;
+    x=__shfl_sync(sync, x, threadIdx.x-1, TPI);
+    x=(group_thread==TPI-1) ? x0 : x;
+    s[0]=(group_thread==TPI-1) ? divisor+divisor : 0;    // silent 1 at the top of s
+
+    #pragma nounroll
+    for(int32_t index=TPI-2;index>=(int32_t)(TPI-LIMBS);index--) {
+      x0=__shfl_sync(sync, x, TPI-1, TPI);
+      q=usqrt_div(x0, x1, divisor, approx);
+      s[0]=(group_thread==index) ? q : s[0];
+
+      p=madhi(q, s[0], 0);
+      x=sub_cc(x, p);
+      c=subc(0, 0);
+      core::fast_propagate_sub(c, x);
+
+      x1=__shfl_sync(sync, x, TPI-1, TPI)-q;    // we subtract q because of the silent 1 at the top of s
+      t0=__shfl_sync(sync, low, index, TPI);
+      x=__shfl_up_sync(sync, x, 1, TPI);
+      x=(group_thread==0) ? t0 : x;
+
+      p=madlo(q, s[0], 0);
+      x=sub_cc(x, p);
+      c=subc(0, 0);
+      x1-=core::fast_propagate_sub(c, x);
+
+      while(0>(int32_t)x1) {
+        x1++;
+        q--;
+
+        // correction step: add q and s
+        x=add_cc(x, (group_thread==index) ? q : 0);
+        c=addc(0, 0);
+        x=add_cc(x, s[0]);
+        c=addc(c, 0);
+
+        x1+=core::resolve_add(c, x);
+
+        // update s
+        s[0]=(group_thread==index) ? q : s[0];
+      }
+      s[0]=(group_thread==index+1) ? s[0]+(q>>31) : s[0];
+      s[0]=(group_thread==index) ? q+q : s[0];
+    }
+    t0=__shfl_down_sync(sync, s[0], 1, TPI);
+    t0=(group_thread==TPI-1) ? 1 : t0;
+    s[0]=uright_wrap(s[0], t0, 1);
+    r[0]=x;
+    return x1;
+  }
+
+  __device__ __forceinline__ static void dlimbs_div_estimate(uint32_t q[DLIMBS], const uint32_t x[DLIMBS], const uint32_t approx[DLIMBS]) {
+    uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
+    uint32_t t, c;
+    uint64_t w;
+
+    // computes q=(x*approx>>32*LIMBS) + x + 3
+    //   q=min(q, (1<<32*LIMBS)-1);
+    //
+    // Notes: leaves junk in lower words of q
+
+    w=0;
+    #pragma unroll
+    for(int32_t index=0;index>32)+t;
+    }
+
+    // increase the estimate by 3
+    t=(group_thread==TPI-LIMBS) ? 3 : 0;
+    w=w + t + x[0];
+
+    q[0]=ulow(w);
+    c=uhigh(w);
+    if(core::resolve_add(c, q[0])!=0)
+      q[0]=0xFFFFFFFF;
+  }
+
+  __device__ __forceinline__ static void dlimbs_sqrt_estimate(uint32_t q[DLIMBS], uint32_t top, const uint32_t x[DLIMBS], const uint32_t approx[DLIMBS]) {
+    uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
+    uint32_t t, high, low;
+    uint64_t w;
+
+    // computes:
+    //   1.  num=((top<<32*LIMBS) + x) / 2
+    //   2.  q=(num*approx>>32*LIMBS) + num + 4
+    //   3.  q=min(q, (1<<32*LIMBS)-1);
+    //
+    // Note: Leaves junk in lower words of q
+
+    // shift x right by 1 bit. Fill high bit with top.
+    t=__shfl_down_sync(sync, x[0], 1, TPI);
+    t=(group_thread==TPI-1) ? top : t;
+    low=uright_wrap(x[0], t, 1);
+
+    // if we're exactly multiple of the size, need to clear out low limb. Not sure if this is really needed.
+    if(LIMBS % TPI == 0) {
+      //low=(group_thread>=LIMBS) ? low : 0;
+    }
+
+    // estimate is in low
+    w=0;
+    #pragma unroll
+    for(int32_t index=0;index>32)+t;
+    }
+
+    // increase the estimate by 4 -- because we might have cleared low bit, estimate can be off by 4
+    t=(group_thread==TPI-LIMBS) ? 4 : 0;
+    w=w + t + low;
+
+    low=ulow(w);
+    high=uhigh(w);
+    if(core::resolve_add(high, low)!=0)
+      low=0xFFFFFFFF;
+    q[0]=low;
+  }
+};
+
+} /* namespace cgbn */

From c3da460078359b0cbbf4ce1bde36d4f211337821 Mon Sep 17 00:00:00 2001
From: Peter Minea
Date: Wed, 7 Feb 2024 12:52:36 +0000
Subject: [PATCH 2/2] Edited the new dlimbs_algs_multi structure

---
 include/cgbn/core/dispatch_dlimbs.cu | 62 ++++++++++++++++------------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/include/cgbn/core/dispatch_dlimbs.cu b/include/cgbn/core/dispatch_dlimbs.cu
index d796ff2..d3da962 100644
--- a/include/cgbn/core/dispatch_dlimbs.cu
+++ b/include/cgbn/core/dispatch_dlimbs.cu
@@ -477,11 +477,11 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
   static const uint32_t DLIMBS=core::DLIMBS;
   static const uint32_t LIMB_OFFSET=DLIMBS*TPI-LIMBS;
 
-  // These algorithms are used when LIMBS >= TPI. Almost the same as the half size ones, few tweaks here and there.
+  // These algorithms are used when LIMBS >= TPI. Almost the same as the half/full size ones, few tweaks here and there.
 
   __device__ __forceinline__ static void dlimbs_approximate(uint32_t approx[DLIMBS], const uint32_t denom[DLIMBS]) {
     uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
-    uint32_t x, d0, d1, x0, x1, x2, est, a, h, l;
+    uint32_t x, d0, d1, x0, x1, x2, est, a, h, l, rem = !(LIMBS % TPI) ? 0 : TPI - (LIMBS % TPI);    // the equivalent of TPI-LIMBS: when TPI < LIMBS it is 0 if LIMBS is a multiple of TPI (e.g. LIMBS=64 with TPI=32), otherwise a value between 1 and TPI-1 (e.g. TPI=32, BITS=1056, LIMBS=1056/32=33, where the 32 is the number of bits per limb, not TPI - rem will be 31)
     int32_t  c, top;
 
     // computes (beta^2 - 1) / denom - beta, where beta=1<<32*LIMBS
@@ -495,7 +495,7 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
     a=uapprox(d1);
 
     #pragma nounroll
-    for(int32_t thread=LIMBS-1;thread>=0;thread--) {
+    for(int32_t thread = LIMBS-1; thread>=0; thread--) {    //Please properly indent your source code.
      x0=__shfl_sync(sync, x, TPI-3, TPI);
      x1=__shfl_sync(sync, x, TPI-2, TPI);
      x2=__shfl_sync(sync, x, TPI-1, TPI);
      est=udiv(x0, x1, x2, d0, d1, a);
@@ -522,25 +522,30 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
       core::fast_propagate_add(c, x);
       est--;
     }
-      approx[0]=(group_thread==thread+TPI-LIMBS) ? est : approx[0];
+      //approx[0]=(group_thread==thread+TPI-LIMBS) ? est : approx[0];
+      approx[0]=(group_thread==thread+rem) ? est : approx[0];
     }
   }
 
   __device__ __forceinline__ static uint32_t dlimbs_sqrt_rem_wide(uint32_t s[DLIMBS], uint32_t r[DLIMBS], const uint32_t lo[DLIMBS], const uint32_t hi[DLIMBS]) {
     uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
-    uint32_t x, x0, x1, t0, t1, divisor, approx, p, q, c, low;
+    uint32_t x, x0, x1, t0, t1, divisor, approx, p, q, c, low, rem = !(LIMBS % TPI) ? 0 : TPI - (LIMBS % TPI);
 
     // computes s=sqrt(x), r=x-s^2, where x=(hi<<32*LIMBS) + lo
 
     low=lo[0];
     x=hi[0];
-    if(TPI!=LIMBS) {
+/*  if(TPI ^ LIMBS) {    //Always true for is_multi when TPI < LIMBS. Also, threadIdx.x-TPI+LIMBS would then be greater than threadIdx.x. Moreover, we can say ^ for !=.
       low=__shfl_sync(sync, low, threadIdx.x-TPI+LIMBS, TPI);
       x=((int32_t)group_thread>=(int32_t)(TPI-LIMBS)) ? x : low;    // use casts to silence warning
-    }
+    }*/
+    //Alternative approach:
+    t0=__shfl_sync(sync, lo[0], threadIdx.x+LIMBS, TPI);
+    x=hi[0] | t0;
+
     x0=__shfl_sync(sync, x, TPI-2, TPI);
     x1=__shfl_sync(sync, x, TPI-1, TPI);
-
+
     divisor=usqrt(x0, x1);
     approx=uapprox(divisor);
 
@@ -555,7 +560,7 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
     x=(group_thread==TPI-1) ? x0 : x;
     s[0]=(group_thread==TPI-1) ? divisor+divisor : 0;    // silent 1 at the top of s
     #pragma nounroll
-    for(int32_t index=TPI-2;index>=(int32_t)(TPI-LIMBS);index--) {
+    for(int32_t index=TPI-2;index >= (int32_t)(0);index--) {    //TPI < LIMBS here, so the bound needs adjusting: TPI-LIMBS would be a negative number. For example BITS=2048, LIMBS=64, TPI=32
       x0=__shfl_sync(sync, x, TPI-1, TPI);
       q=usqrt_div(x0, x1, divisor, approx);
       s[0]=(group_thread==index) ? q : s[0];
@@ -575,7 +580,7 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
       c=subc(0, 0);
       x1-=core::fast_propagate_sub(c, x);
 
-      while(0>(int32_t)x1) {
+      while(0 > (int32_t)x1) {
         x1++;
         q--;
 
@@ -602,7 +607,7 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
 
   __device__ __forceinline__ static void dlimbs_div_estimate(uint32_t q[DLIMBS], const uint32_t x[DLIMBS], const uint32_t approx[DLIMBS]) {
     uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
-    uint32_t t, c;
+    uint32_t t, c, rem = !(LIMBS % TPI) ? 0 : TPI - (LIMBS % TPI);
     uint64_t w;
 
     // computes q=(x*approx>>32*LIMBS) + x + 3
@@ -612,17 +617,19 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
     w=0;
     #pragma unroll
-    for(int32_t index=0;index>32)+t;
     }
 
     // increase the estimate by 3
-    t=(group_thread==TPI-LIMBS) ? 3 : 0;
+    //t=(group_thread==TPI-LIMBS) ? 3 : 0;
+    t=(group_thread == rem) ? 3 : 0;
     w=w + t + x[0];
 
     q[0]=ulow(w);
@@ -633,7 +640,7 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
 
   __device__ __forceinline__ static void dlimbs_sqrt_estimate(uint32_t q[DLIMBS], uint32_t top, const uint32_t x[DLIMBS], const uint32_t approx[DLIMBS]) {
     uint32_t sync=core::sync_mask(), group_thread=threadIdx.x & TPI-1;
-    uint32_t t, high, low;
+    uint32_t t, high, low, rem = !(LIMBS % TPI) ? 0 : TPI - (LIMBS % TPI);
     uint64_t w;
 
     // computes:
@@ -648,33 +655,36 @@ class dispatch_dlimbs_t<core, dlimbs_algs_multi> {
     t=(group_thread==TPI-1) ? top : t;
     low=uright_wrap(x[0], t, 1);
 
-    // if we're exactly multiple of the size, need to clear out low limb. Not sure if this is really needed.
-    if(LIMBS % TPI == 0) {
+    // if we're exactly a multiple of the size, we may need to clear out the low limb. Not sure if this is really needed for multi; it was for half (LIMBS half of TPI) and already not for full.
+    // if(LIMBS % TPI == 0) {
       //low=(group_thread>=LIMBS) ? low : 0;
-    }
+    // }
 
     // estimate is in low
     w=0;
     #pragma unroll
     for(int32_t index=0;index>32)+t;
+      //t=((group_thread+1) % TPI == 0) ? 0 : t;
+      t=(((group_thread + 1) & (TPI-1)) == 0) ? 0 : t;    //group_thread+1 is divisible by TPI, which is a power of two, so we can test it by masking the low log2(TPI) bits of group_thread+1
+      w = (w>>32)+t;
     }
 
     // increase the estimate by 4 -- because we might have cleared low bit, estimate can be off by 4
-    t=(group_thread==TPI-LIMBS) ? 4 : 0;
-    w=w + t + low;
+    t = (group_thread == rem) ? 4 : 0;
+    w = w + t + low;
 
     low=ulow(w);
     high=uhigh(w);
-    if(core::resolve_add(high, low)!=0)
+    if(core::resolve_add(high, low)!=0) {
      low=0xFFFFFFFF;
+    }
+
     q[0]=low;
   }
 };
 
-} /* namespace cgbn */
+} /* namespace cgbn */
\ No newline at end of file
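
The rem value introduced in the second patch replaces the TPI-LIMBS offset used by the half/full variants. The short host-side sketch below is illustrative only: it is not part of either patch, and it assumes 32-bit limbs as CGBN uses. It prints rem for the configurations mentioned in the patch comments and checks that the masked lane test ((group_thread + 1) & (TPI - 1)) == 0 agrees with the modulo form when TPI is a power of two (note the extra parentheses: == binds tighter than &).

// Standalone host-side sketch, illustrative only (not part of the patch).
#include <cstdint>
#include <cstdio>

static uint32_t rem_for(uint32_t bits, uint32_t tpi) {
  uint32_t limbs = bits / 32;                  // 32-bit limbs, as in CGBN
  uint32_t m = limbs % tpi;
  return (m == 0) ? 0 : tpi - m;               // same expression the patch uses for rem
}

int main() {
  // configurations mentioned in the patch comments
  std::printf("BITS=1056 TPI=32 -> rem=%u (expected 31)\n", rem_for(1056, 32));
  std::printf("BITS=2048 TPI=32 -> rem=%u (expected 0)\n",  rem_for(2048, 32));

  // lane test: mask form vs. modulo form; TPI must be a power of two
  const uint32_t tpi = 32;
  for (uint32_t t = 0; t < tpi; t++) {
    bool mask_form = ((t + 1) & (tpi - 1)) == 0;   // extra parentheses matter: == binds tighter than &
    bool mod_form  = (t + 1) % tpi == 0;
    if (mask_form != mod_form)
      std::printf("mismatch at group_thread=%u\n", t);
  }
  return 0;
}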
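
The comments in dlimbs_approximate and dlimbs_div_estimate describe the quantities approx = (beta^2 - 1) / denom - beta and q = (x*approx >> 32*LIMBS) + x + 3. The following host-side sketch is a toy check of that relationship, not part of the patch: it uses a small beta = 2^16 so the arithmetic fits in 64-bit integers, and verifies that the estimate never undershoots the true quotient floor(x*beta/denom) and stays within a few units above it.

// Standalone host-side sketch, illustrative only (not part of the patch).
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t beta = 1ull << 16;                                  // toy stand-in for 2^(32*LIMBS)
  for (uint64_t denom = beta / 2; denom < beta; denom += 997) {      // normalized divisors (beta/2 <= denom < beta)
    uint64_t approx = (beta * beta - 1) / denom - beta;              // what dlimbs_approximate computes
    for (uint64_t x = 0; x < beta; x += 1009) {                      // high limbs of the dividend
      uint64_t true_q = (x * beta) / denom;                          // exact quotient of x*beta / denom
      uint64_t est_q  = (x * approx) / beta + x + 3;                 // dlimbs_div_estimate's formula
      if (est_q < true_q || est_q > true_q + 4)                      // should never undershoot, stays close
        std::printf("bad estimate: denom=%llu x=%llu true=%llu est=%llu\n",
                    (unsigned long long)denom, (unsigned long long)x,
                    (unsigned long long)true_q, (unsigned long long)est_q);
    }
  }
  std::printf("done\n");
  return 0;
}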