From b7d8ba1589f99f6601790fb11223181339b2d232 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 3 Jun 2024 11:17:39 -0400
Subject: [PATCH 1/6] Enable FP6-LLM kernel build on Windows

---
 setup.py                                      | 44 +++++++---
 torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh   |  6 +-
 torchao/csrc/cuda/fp6_llm/ptx_mma.cuh         |  8 +-
 torchao/csrc/cuda/fp6_llm/utils_core.cuh      |  4 +-
 torchao/csrc/cuda/fp6_llm/utils_gmem.cuh      | 10 +--
 .../cuda/fp6_llm/utils_parallel_dequant.cuh   | 24 +++---
 torchao/utils.py                              | 84 +++++++++----------
 7 files changed, 99 insertions(+), 81 deletions(-)

diff --git a/setup.py b/setup.py
index 014a36832..88083cd91 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@ def read_requirements(file_path):
     CUDAExtension,
     BuildExtension,
     CUDA_HOME,
+    IS_WINDOWS
 )
 
 
@@ -49,21 +50,38 @@ def get_extensions():
     use_cuda = torch.cuda.is_available() and CUDA_HOME is not None
     extension = CUDAExtension if use_cuda else CppExtension
 
-    extra_link_args = ["-fopenmp"]
-    extra_compile_args = {
-        "cxx": [
-            "-O3" if not debug_mode else "-O0",
-            "-fdiagnostics-color=always",
-            "-fopenmp",
-        ],
-        "nvcc": [
-            "-O3" if not debug_mode else "-O0",
-        ]
-    }
+    if not IS_WINDOWS:
+        extra_link_args = ["-fopenmp"]
+        extra_compile_args = {
+            "cxx": [
+                "-O3" if not debug_mode else "-O0",
+                "-fdiagnostics-color=always",
+                "-fopenmp",
+            ],
+            "nvcc": [
+                "-O3" if not debug_mode else "-O0",
+            ]
+        }
+    else:
+        extra_link_args = []
+        extra_compile_args = {
+            "cxx": [
+                "/O2" if not debug_mode else "/Od",
+                "/openmp",
+                "/permissive-"
+            ],
+            "nvcc": [
+                "-O3" if not debug_mode else "-O0",
+                "-t=0"
+            ]
+    }
+
     if debug_mode:
-        extra_compile_args["cxx"].append("-g")
+        extra_compile_args["cxx"].append("-g" if not IS_WINDOWS else "/ZI")
         extra_compile_args["nvcc"].append("-g")
-        extra_link_args.extend(["-O0", "-g"])
+        extra_link_args.extend(["-O0", "-g"] if not IS_WINDOWS else ["/DEBUG"])
+
+
     this_dir = os.path.dirname(os.path.curdir)
     extensions_dir = os.path.join(this_dir, "torchao", "csrc")
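
Note on the setup.py hunk above: the Windows branch maps each GCC-style host flag to its MSVC spelling (-O3 -> /O2, -fopenmp -> /openmp, plus /permissive- to opt into standards-conforming parsing), while the nvcc flags keep their GCC-style spelling because nvcc accepts them with either host compiler; -t=0 additionally lets nvcc parallelize its work across all available CPU cores. A minimal sketch, not from this patch, of the kind of host-side construct the OpenMP switches must be able to compile (the loop body is an arbitrary placeholder):

    // Plain C++ sketch: -fopenmp (GCC/Clang) and /openmp (MSVC) enable the
    // same pragma; only the command-line spelling of the flag differs.
    #include <cstdio>

    int main() {
        float acc[8] = {};
    #pragma omp parallel for  // ignored (at most a warning) when OpenMP is off
        for (int i = 0; i < 8; ++i) {
            acc[i] = i * 0.5f;
        }
        std::printf("acc[7] = %f\n", acc[7]);
        return 0;
    }

Built without either flag, the program still compiles and runs serially, which is why the pragma-based approach degrades gracefully.
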
diff --git a/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh b/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
index de7775ddc..cf8533b00 100644
--- a/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
+++ b/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
@@ -133,11 +133,11 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
     uint32_t* __restrict__ write_SPTR_Frag1 = AFrag_2BIT_SPTR + ((tile_id_k+(PIPELINE_LEVEL_GMEM-1)) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A1/4*4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
     uint32_t* __restrict__ write_SPTR_Frag2 = AFrag_4BIT_SPTR + ((tile_id_k+(PIPELINE_LEVEL_GMEM-1)) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A2/4*4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
     // Trible-Buffer for B Tile
-    half __restrict__ (*read_SPTR )[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+    half (* __restrict__ read_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
 #ifdef PIPELINE_LEVEL_SMEM
-    half __restrict__ (*read2_SPTR )[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+    half (* __restrict__ read2_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
 #endif
-    half __restrict__ (*write_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+(PIPELINE_LEVEL_GMEM-1)) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+    half (* __restrict__ write_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+(PIPELINE_LEVEL_GMEM-1)) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
     // bool GlobalCopy = (tile_id_k+PIPELINE_LEVEL_GMEM-1) < NumIter;
     // Copying A tile from Global to Register, Bypassing L1, using double-buffer
diff --git a/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh b/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh
index d0985bd63..cceff992c 100644
--- a/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh
+++ b/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh
@@ -38,9 +38,9 @@
 #ifdef PIPELINE_LEVEL_SMEM
 template <typename TilingConfig>
-__device__ __forceinline__ void B_FromSharedToReg(uint32_t __restrict__ Reg[][4],
-                                                  half __restrict__ (*read_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
-                                                  int slice_id) {
+__device__ __forceinline__ void B_FromSharedToReg(uint32_t Reg[][4],
+                                                  half (*read_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                                  int slice_id) {
 #ifdef DEBUG_MODE
     static_assert( (TilingConfig::WARP_COL_MMA_TENSORS==1) || (TilingConfig::WARP_COL_MMA_TENSORS%2==0) );
 #endif
@@ -113,7 +113,7 @@ __device__ __forceinline__ void B_FromSharedToReg(uint32_t __restrict__ Reg[
 }
 #endif
 
 __device__ __forceinline__ void
-MMA_FP16_M16N8K16(uint32_t __restrict__ c[], uint32_t __restrict__ *a, uint32_t __restrict__ *b)
+MMA_FP16_M16N8K16(uint32_t * __restrict__ c, uint32_t * __restrict__ a, uint32_t * __restrict__ b)
 {
     asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
                  "{ %0, %1, %2, %3},"
diff --git a/torchao/csrc/cuda/fp6_llm/utils_core.cuh b/torchao/csrc/cuda/fp6_llm/utils_core.cuh
index 5bfc043ef..4ff2b2cb7 100644
--- a/torchao/csrc/cuda/fp6_llm/utils_core.cuh
+++ b/torchao/csrc/cuda/fp6_llm/utils_core.cuh
@@ -40,7 +40,7 @@ __device__ __forceinline__ void initialize_mma_slice(uint32_t (
                                                      uint32_t (*b)[4],
                                                      uint32_t* __restrict__ A1_SPTR_read,
                                                      uint32_t* __restrict__ A2_SPTR_read,
-                                                     half __restrict__ (*B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                                     half (* __restrict__ B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
                                                      uint32_t* RPTR_Scales)
 {
     // Writing registers
@@ -59,7 +59,7 @@ __device__ __forceinline__ void core_mma_slice(float c[][REG
                                                uint32_t (*b)[4],
                                                uint32_t* __restrict__ A1_SPTR_read,
                                                uint32_t* __restrict__ A2_SPTR_read,
-                                               half __restrict__ (*B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                               half (* __restrict__ B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
                                                uint32_t* RPTR_Scales,
                                                int slice_id)  // writing slice[slice_id] to registers, k=0 -> slice_id=1 for prefetching
 {
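
Note on the three .cuh hunks above: the recurring edit is the position of __restrict__ in pointer-to-array declarations. GCC and nvcc on Linux tolerated half __restrict__ (*p)[N], where the qualifier is written as if it belonged to the element type, but the MSVC front end rejects that spelling; qualifying the pointer inside the declarator, half (* __restrict__ p)[N], parses on every toolchain. A minimal sketch, not from the patch, with float standing in for half and COLS standing in for WARP_K+PADDING_SHARED_MEM_FOR_B_8 (the macro shim is an assumption for a pure MSVC host build, which spells the keyword __restrict):

    #if defined(_MSC_VER) && !defined(__CUDACC__)
    #define __restrict__ __restrict  // assumption: plain MSVC host build
    #endif

    constexpr int COLS = 72;  // placeholder for WARP_K+PADDING_SHARED_MEM_FOR_B_8

    // float __restrict__ (*tile)[COLS];   // accepted by GCC, rejected by MSVC
    void fill(float (* __restrict__ tile)[COLS]) {  // portable placement
        tile[0][0] = 1.0f;  // tile steps over whole rows of COLS floats
    }

    int main() {
        static float smem[4][COLS] = {};
        fill(smem);  // float[4][COLS] decays to float (*)[COLS]
        return 0;
    }
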
diff --git a/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh b/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh
index 5c37452e1..43356d704 100644
--- a/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh
+++ b/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh
@@ -63,11 +63,11 @@ __device__ __forceinline__ void CopyFromGlobalToShared_Scales(half* SPTR_QuantSc
  * 16 Bytes per thread -> 512 Bytes per WARP = 4 line per WARP = 1 line per 8 Threads
  */
 template <int MaxNumOfLinesToCopy, int BLOCK_WARPS>
-__device__ __forceinline__ void CopyFromGlobalToShared(half __restrict__ (*SharedPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
-                                                       const half* GlobalPTR,
-                                                       const int   GlobalStride,
-                                                       const int   NumOfLinesLeft,        // To support arbitrary N dimensions.
-                                                       bool        Pred = true) {
+__device__ __forceinline__ void CopyFromGlobalToShared(half (* __restrict__ SharedPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                                       const half* GlobalPTR,
+                                                       const int   GlobalStride,
+                                                       const int   NumOfLinesLeft,        // To support arbitrary N dimensions.
+                                                       bool        Pred = true) {
     // static parameters: 1 Group (8 Threads) can copy 1 line (64 FP16) each time
     const int NumOfThreads = BLOCK_WARPS * WARP_SIZE;
     const int NumOfGroups = NumOfThreads / 8;
diff --git a/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh b/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh
index f6ce4cc04..b08865931 100644
--- a/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh
+++ b/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh
@@ -26,7 +26,7 @@
  * Outputs: R1, R2
  * Note:  Simplified Exponent calculation is applied.
  */
-__device__ __forceinline__ void FP6_FP16_Cast_4Way(u_int32_t *R1, u_int32_t *R2) {
+__device__ __forceinline__ void FP6_FP16_Cast_4Way(uint32_t *R1, uint32_t *R2) {
     *R2 = *R1 & 0x80808080;
     *R1 = *R1 >> 2;
     *R1 = *R1 & 0x1f1f1f1f;
@@ -41,7 +41,7 @@ __device__ __forceinline__ void FP6_FP16_Cast_4Way(u_int32_t *R1, u_int32_t *R2)
  * Outputs: R1, R2
  * Note:  Simplified Exponent calculation is NOT applied.
  */
-__device__ __forceinline__ void FP6_FP16_Cast_4Way_Naive(u_int32_t *R1, u_int32_t *R2) {
+__device__ __forceinline__ void FP6_FP16_Cast_4Way_Naive(uint32_t *R1, uint32_t *R2) {
     //*R2 = *R1 & 0x80808080;
     *R2 = *R1 & 0xc0c0c0c0;
     *R1 = *R1 >> 2;
@@ -63,7 +63,7 @@ __device__ __forceinline__ void FP6_FP16_Cast_4Way_Naive(u_int32_t *R1, u_int32_
     //*R2 = 0x3c003c00;
 }
 
-__device__ __forceinline__ u_int32_t MultScale(u_int32_t PackedFP16Pair, half Scale) {
+__device__ __forceinline__ uint32_t MultScale(uint32_t PackedFP16Pair, half Scale) {
     half* FP16_1 = reinterpret_cast<half*>(&PackedFP16Pair);
     half* FP16_2 = FP16_1 + 1;
     uint32_t output;
@@ -73,16 +73,16 @@ __device__ __forceinline__ u_int32_t MultScale(u_int32_t PackedFP16Pair, half Sc
     return output;
 }
 
-__device__ __forceinline__ void Dequant_32FP6_4Way(u_int32_t __restrict__ Reg[][4],
-                                                   u_int32_t __restrict__ *read_RPTR_Frag1,
-                                                   u_int32_t __restrict__ *read_RPTR_Frag2,
-                                                   u_int32_t *Scales) {
-    u_int32_t *OutputRegs    = reinterpret_cast<u_int32_t*> (Reg);
-    u_int32_t *Frag1_PTR     = read_RPTR_Frag1;
-    u_int32_t *Frag2_PTR     = read_RPTR_Frag2;
+__device__ __forceinline__ void Dequant_32FP6_4Way(uint32_t Reg[][4],
+                                                   uint32_t * __restrict__ read_RPTR_Frag1,
+                                                   uint32_t * __restrict__ read_RPTR_Frag2,
+                                                   uint32_t * Scales) {
+    uint32_t *OutputRegs    = reinterpret_cast<uint32_t*> (Reg);
+    uint32_t *Frag1_PTR     = read_RPTR_Frag1;
+    uint32_t *Frag2_PTR     = read_RPTR_Frag2;
     half     *Scale_RPTR    = reinterpret_cast<half*>(Scales);
-    u_int32_t Packed_FP6    = 0;
-    u_int32_t tmp           = 0;
+    uint32_t  Packed_FP6    = 0;
+    uint32_t  tmp           = 0;
     // Dequantizing 32 FP6, each Loop dequantizing 4 FP6
     #pragma unroll(8)
     for(int i=0; i<8; i++) {
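
Note on the utils_parallel_dequant.cuh hunk above: it is a pure rename. u_int32_t is a BSD/glibc typedef that happens to be in scope on Linux through the system headers but does not exist under MSVC; uint32_t is the standard fixed-width type from <stdint.h>/<cstdint> and is the portable spelling. A minimal sketch, not from the patch:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // u_int32_t x = 0xdeadbeefu;  // compiles on glibc, fails under MSVC
        uint32_t x = 0xdeadbeefu;      // standard, identical on every platform
        static_assert(sizeof(x) == 4, "uint32_t is exactly 4 bytes");
        std::printf("%08x\n", x);
        return 0;
    }
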
diff --git a/torchao/utils.py b/torchao/utils.py
index 2835129f7..763ada444 100644
--- a/torchao/utils.py
+++ b/torchao/utils.py
@@ -1,46 +1,46 @@
-import torch
+import torch
 import torch.utils.benchmark as benchmark
-
-
-def benchmark_model(model, num_runs, input_tensor):
-    torch.cuda.synchronize()
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
-
-    # benchmark
-    for _ in range(num_runs):
-        with torch.autograd.profiler.record_function("timed region"):
-            model(input_tensor)
-
-    end_event.record()
-    torch.cuda.synchronize()
-    return start_event.elapsed_time(end_event) / num_runs
-
-def profiler_runner(path, fn, *args, **kwargs):
-    with torch.profiler.profile(
-            activities=[torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA],
-            record_shapes=True) as prof:
-        result = fn(*args, **kwargs)
-    prof.export_chrome_trace(path)
-    return result
-
-def get_compute_capability():
-    if torch.cuda.is_available():
-        capability = torch.cuda.get_device_capability()
-        return float(f"{capability[0]}.{capability[1]}")
-    return 0.0
-
-def skip_if_compute_capability_less_than(min_capability):
-    import unittest
-    def decorator(test_func):
-        def wrapper(*args, **kwargs):
-            if get_compute_capability() < min_capability:
-                raise unittest.SkipTest(f"Compute capability is less than {min_capability}")
-            return test_func(*args, **kwargs)
-        return wrapper
-    return decorator
+
+
+def benchmark_model(model, num_runs, input_tensor):
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+
+    # benchmark
+    for _ in range(num_runs):
+        with torch.autograd.profiler.record_function("timed region"):
+            model(input_tensor)
+
+    end_event.record()
+    torch.cuda.synchronize()
+    return start_event.elapsed_time(end_event) / num_runs
+
+def profiler_runner(path, fn, *args, **kwargs):
+    with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA],
+            record_shapes=True) as prof:
+        result = fn(*args, **kwargs)
+    prof.export_chrome_trace(path)
+    return result
+
+def get_compute_capability():
+    if torch.cuda.is_available():
+        capability = torch.cuda.get_device_capability()
+        return float(f"{capability[0]}.{capability[1]}")
+    return 0.0
+
+def skip_if_compute_capability_less_than(min_capability):
+    import unittest
+    def decorator(test_func):
+        def wrapper(*args, **kwargs):
+            if get_compute_capability() < min_capability:
+                raise unittest.SkipTest(f"Compute capability is less than {min_capability}")
+            return test_func(*args, **kwargs)
+        return wrapper
+    return decorator
 
 
 def benchmark_torch_function_in_microseconds(f, *args, **kwargs):

From 0a61e590c2a76b5d2fe1031ee8a9596b7cd9caf3 Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Tue, 18 Jun 2024 21:21:34 +0800
Subject: [PATCH 2/6] fix benchmark script

---
 benchmarks/benchmark_fp6_llm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_fp6_llm.py b/benchmarks/benchmark_fp6_llm.py
index b6fdca643..ae17764e6 100644
--- a/benchmarks/benchmark_fp6_llm.py
+++ b/benchmarks/benchmark_fp6_llm.py
@@ -7,12 +7,12 @@
 
 def benchmark(m: int, k: int, n: int):
-    fp6_weight = torch.randint(256, size=(n, k // 4 * 3), dtype=torch.uint8, device="cuda")
+    fp6_weight = torch.randint(256, size=(n, k * 3 // 4), dtype=torch.uint8, device="cuda")
     scales = torch.rand(n, dtype=torch.half, device="cuda") + 0.5
-    fp6_linear = Fp6LlmLinear(fp6_weight.view(torch.int32), scales)
+    fp6_linear = Fp6LlmLinear(fp6_weight, scales)
 
     fp16_linear = nn.Linear(k, n, bias=True, dtype=torch.half, device="cuda")
-    fp16_linear.weight.data = from_tc_float6_e3m2(fp6_weight.view(-1), n, k, dtype=torch.half) * scales[:, None]
+    fp16_linear.weight.data = from_tc_float6_e3m2(fp6_weight, dtype=torch.half) * scales[:, None]
 
     fp16_act = torch.randn(m, k, dtype=torch.half, device="cuda")
     fp6_output = fp6_linear(fp16_act)
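
Note on the benchmark fix above: the script tracks an API change, passing the packed uint8 weight tensor to Fp6LlmLinear and from_tc_float6_e3m2 directly instead of reinterpreting it first, and it rewrites the width expression as k * 3 // 4. Four 6-bit values pack into exactly three bytes, so a row of k FP6 weights occupies k*3/4 bytes, and multiplying before dividing keeps the integer arithmetic exact. A sketch of that size calculation in C++ (fp6_packed_bytes is a hypothetical helper, not a function from the repo):

    #include <cstddef>
    #include <cstdio>

    // Four 6-bit values occupy 24 bits = 3 bytes, hence k * 3 / 4 bytes per row.
    constexpr std::size_t fp6_packed_bytes(std::size_t k) {
        return k * 3 / 4;  // multiply first: exact whenever k is a multiple of 4
    }

    int main() {
        static_assert(fp6_packed_bytes(8192) == 6144, "8192 FP6 values -> 6144 bytes");
        std::printf("k=8192 -> %zu bytes per row\n", fp6_packed_bytes(8192));
        return 0;
    }
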
From ce2b9fc7832d789454aa7ad1dbbcbd29026ed7a6 Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Tue, 18 Jun 2024 21:30:32 +0800
Subject: [PATCH 3/6] update setup.py

---
 setup.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index c5c14cf2c..ef6405a2d 100644
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,12 @@ def get_extensions():
                 "-O3" if not debug_mode else "-O0",
             ]
         }
+
+        if debug_mode:
+            extra_compile_args["cxx"].append("-g")
+            extra_compile_args["nvcc"].append("-g")
+            extra_link_args.extend(["-O0", "-g"])
+
     else:
         extra_link_args = []
         extra_compile_args = {
@@ -77,10 +83,10 @@ def get_extensions():
             ]
     }
 
-    if debug_mode:
-        extra_compile_args["cxx"].append("-g" if not IS_WINDOWS else "/ZI")
-        extra_compile_args["nvcc"].append("-g")
-        extra_link_args.extend(["-O0", "-g"] if not IS_WINDOWS else ["/DEBUG"])
+        if debug_mode:
+            extra_compile_args["cxx"].append("/ZI")
+            extra_compile_args["nvcc"].append("-g")
+            extra_link_args.append("/DEBUG")
 
     this_dir = os.path.dirname(os.path.curdir)
     extensions_dir = os.path.join(this_dir, "torchao", "csrc")
From 54f4594a258dbc530fcd1e81633043a4d2a13234 Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Tue, 18 Jun 2024 22:36:34 +0800
Subject: [PATCH 4/6] update

---
 torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh      |  3 ++-
 torchao/csrc/cuda/fp6_llm/ptx_mma.cuh            | 13 +++++++++----
 torchao/csrc/cuda/fp6_llm/utils_core.cuh         |  8 +++++---
 torchao/csrc/cuda/fp6_llm/utils_gmem.cuh         | 13 +++++++------
 .../csrc/cuda/fp6_llm/utils_parallel_dequant.cuh | 14 +++++++++-----
 5 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh b/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
index cf8533b00..ed11fc851 100644
--- a/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
+++ b/torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
@@ -12,7 +12,7 @@
 //    See the License for the specific language governing permissions and
 //    limitations under the License.
 //
-// This file is copied from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/kernel_matmul.cuh
+// This file is modified from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/kernel_matmul.cuh
 
 #include "configs.h"
 #include "utils_gmem.cuh"
@@ -133,6 +133,7 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
     uint32_t* __restrict__ write_SPTR_Frag1 = AFrag_2BIT_SPTR + ((tile_id_k+(PIPELINE_LEVEL_GMEM-1)) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A1/4*4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
     uint32_t* __restrict__ write_SPTR_Frag2 = AFrag_4BIT_SPTR + ((tile_id_k+(PIPELINE_LEVEL_GMEM-1)) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A2/4*4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
     // Trible-Buffer for B Tile
+    // MODIFICATION NOTE: to support MSVC, half __restrict__ (*read_SPTR ) is changed to below. similarly for read2_SPTR and write_SPTR.
     half (* __restrict__ read_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
 #ifdef PIPELINE_LEVEL_SMEM
     half (* __restrict__ read2_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] = smem_array + ((tile_id_k+1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
 #endif
diff --git a/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh b/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh
index cceff992c..bafdd0b4e 100644
--- a/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh
+++ b/torchao/csrc/cuda/fp6_llm/ptx_mma.cuh
@@ -12,7 +12,7 @@
 //    See the License for the specific language governing permissions and
 //    limitations under the License.
 //
-// This file is copied from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/ptx_mma.cuh
+// This file is modified from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/ptx_mma.cuh
 /***************************************************************************
  * Copyright 2023 The FLash-LLM Authors. All rights reserved.
  *
@@ -36,11 +36,14 @@
 #include <assert.h>
 #include "configs.h"
 
+// MODIFICATION NOTE: to support MSVC
+// - uint32_t __restrict__ Reg[][4] is changed to uint32_t (* __restrict__ Reg)[4]
+// - half __restrict__ (*read_SPTR) is changed to half (* __restrict__ read_SPTR)
 #ifdef PIPELINE_LEVEL_SMEM
 template <typename TilingConfig>
-__device__ __forceinline__ void B_FromSharedToReg(uint32_t Reg[][4],
-                                                  half (*read_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
-                                                  int slice_id) {
+__device__ __forceinline__ void B_FromSharedToReg(uint32_t (* __restrict__ Reg)[4],
+                                                  half (* __restrict__ read_SPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                                  int slice_id) {
 #ifdef DEBUG_MODE
     static_assert( (TilingConfig::WARP_COL_MMA_TENSORS==1) || (TilingConfig::WARP_COL_MMA_TENSORS%2==0) );
 #endif
@@ -112,6 +115,8 @@ __device__ __forceinline__ void B_FromSharedToReg(uint32_t __restrict__ Reg[
 }
 #endif
 
+// MODIFICATION NOTE: to support MSVC, the function signature is changed from
+// MMA_FP16_M16N8K16(uint32_t __restrict__ c[], uint32_t __restrict__ *a, uint32_t __restrict__ *b).
 __device__ __forceinline__ void
 MMA_FP16_M16N8K16(uint32_t * __restrict__ c, uint32_t * __restrict__ a, uint32_t * __restrict__ b)
 {
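
Note on the ptx_mma.cuh hunks above: both MODIFICATION NOTEs lean on the same language rule, namely that a function parameter declared as an array is adjusted to a pointer, so uint32_t c[] and uint32_t *c (or uint32_t Reg[][4] and uint32_t (*Reg)[4]) declare the same signature, and only the pointer spelling leaves a position for the restrict qualifier that MSVC accepts. A minimal sketch, not from the patch (same MSVC shim assumption as before):

    #if defined(_MSC_VER) && !defined(__CUDACC__)
    #define __restrict__ __restrict  // assumption: plain MSVC host build
    #endif
    #include <cstdint>

    // void f(uint32_t __restrict__ c[]);       // MSVC rejects this placement
    void f(uint32_t* __restrict__ c) { c[0] = 1; }  // adjusted form, portable

    // void g(uint32_t __restrict__ Reg[][4]);  // likewise rejected
    void g(uint32_t (* __restrict__ Reg)[4]) { Reg[0][0] = 2; }

    int main() {
        uint32_t c[2] = {};
        uint32_t Reg[3][4] = {};
        f(c);    // uint32_t[2] decays to uint32_t*
        g(Reg);  // uint32_t[3][4] decays to uint32_t (*)[4]
        return 0;
    }
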
diff --git a/torchao/csrc/cuda/fp6_llm/utils_core.cuh b/torchao/csrc/cuda/fp6_llm/utils_core.cuh
index 4ff2b2cb7..07e37d85b 100644
--- a/torchao/csrc/cuda/fp6_llm/utils_core.cuh
+++ b/torchao/csrc/cuda/fp6_llm/utils_core.cuh
@@ -12,7 +12,7 @@
 //    See the License for the specific language governing permissions and
 //    limitations under the License.
 //
-// This file is copied from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_core.cuh
+// This file is modified from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_core.cuh
 
 #ifndef UTILS_CORE_CUH
 #define UTILS_CORE_CUH
@@ -35,12 +35,13 @@ __device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[], u
     }
 }
 
+// MODIFICATION NOTE: to support MSVC, half __restrict__ (*B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] is changed to below.
 template <typename TilingConfig>
 __device__ __forceinline__ void initialize_mma_slice(uint32_t (*a)[4],
                                                      uint32_t (*b)[4],
                                                      uint32_t* __restrict__ A1_SPTR_read,
                                                      uint32_t* __restrict__ A2_SPTR_read,
-                                                     half (* __restrict__ B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                                     half     (* __restrict__ B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
                                                      uint32_t* RPTR_Scales)
 {
     // Writing registers
@@ -53,13 +54,14 @@ __device__ __forceinline__ void initialize_mma_slice(uint32_t (
     B_FromSharedToReg<TilingConfig>(b, B_SPTR_read, 0);     // Loading B from shared to registers
 }
 
+// MODIFICATION NOTE: to support MSVC, half __restrict__ (*B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] is changed to below.
 template <typename TilingConfig>
 __device__ __forceinline__ void core_mma_slice(float c[][REG_PER_THREAD_C_TENSOR_16_16],
                                                uint32_t (*a)[4],
                                                uint32_t (*b)[4],
                                                uint32_t* __restrict__ A1_SPTR_read,
                                                uint32_t* __restrict__ A2_SPTR_read,
-                                               half (* __restrict__ B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                               half     (* __restrict__ B_SPTR_read)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
                                                uint32_t* RPTR_Scales,
                                                int slice_id)  // writing slice[slice_id] to registers, k=0 -> slice_id=1 for prefetching
 {
diff --git a/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh b/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh
index 43356d704..a74930ba4 100644
--- a/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh
+++ b/torchao/csrc/cuda/fp6_llm/utils_gmem.cuh
@@ -12,7 +12,7 @@
 //    See the License for the specific language governing permissions and
 //    limitations under the License.
 //
-// This file is copied from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_gmem.cuh
+// This file is modified from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_gmem.cuh
 
 #ifndef UTILS_GMEM_CUH
 #define UTILS_GMEM_CUH
@@ -57,17 +57,18 @@ __device__ __forceinline__ void CopyFromGlobalToShared_Scales(half* SPTR_QuantSc
     for(int i=0; i<2; i++) SPTR_QuantScales[Offset_Shared+i] = GPTR_A_Scales[Offset_Global+i*8];
 }
 
+// MODIFICATION NOTE: to support MSVC, half __restrict__ (*SharedPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8] is changed to below.
 /*
  * (1) Copying X  rows * 64 columns of FP16 values, originally in row    major
  * (2) Copying 64 rows * X  columns of FP16 values, originally in column major
  * 16 Bytes per thread -> 512 Bytes per WARP = 4 line per WARP = 1 line per 8 Threads
  */
 template <int MaxNumOfLinesToCopy, int BLOCK_WARPS>
-__device__ __forceinline__ void CopyFromGlobalToShared(half (* __restrict__ SharedPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
-                                                       const half* GlobalPTR,
-                                                       const int   GlobalStride,
-                                                       const int   NumOfLinesLeft,        // To support arbitrary N dimensions.
-                                                       bool        Pred = true) {
+__device__ __forceinline__ void CopyFromGlobalToShared(half (* __restrict__ SharedPTR)[WARP_K+PADDING_SHARED_MEM_FOR_B_8],
+                                                       const half*  GlobalPTR,
+                                                       const int    GlobalStride,
+                                                       const int    NumOfLinesLeft,       // To support arbitrary N dimensions.
+                                                       bool         Pred = true) {
     // static parameters: 1 Group (8 Threads) can copy 1 line (64 FP16) each time
     const int NumOfThreads = BLOCK_WARPS * WARP_SIZE;
     const int NumOfGroups = NumOfThreads / 8;
diff --git a/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh b/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh
index b08865931..48b0f968b 100644
--- a/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh
+++ b/torchao/csrc/cuda/fp6_llm/utils_parallel_dequant.cuh
@@ -12,7 +12,8 @@
 //    See the License for the specific language governing permissions and
 //    limitations under the License.
 //
-// This file is copied from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_parallel_dequant.cuh
+// This file is modified from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/fp6_llm/csrc/include/utils_parallel_dequant.cuh
+// To support MSVC, all instances of u_int32_t are changed to uint32_t.
 
 #ifndef UTILS_PARALLELDEQUANT_CUH
 #define UTILS_PARALLELDEQUANT_CUH
@@ -73,10 +74,13 @@ __device__ __forceinline__ uint32_t MultScale(uint32_t PackedFP16Pair, half Scal
     return output;
 }
 
-__device__ __forceinline__ void Dequant_32FP6_4Way(uint32_t Reg[][4],
-                                                   uint32_t * __restrict__ read_RPTR_Frag1,
-                                                   uint32_t * __restrict__ read_RPTR_Frag2,
-                                                   uint32_t * Scales) {
+// MODIFICATION NOTE: to support MSVC
+// - u_int32_t __restrict__ Reg[][4] is changed to below.
+// - u_int32_t __restrict__ *read_RPTR_Frag1 is changed to below. similarly for read_RPTR_Frag2
+__device__ __forceinline__ void Dequant_32FP6_4Way(uint32_t (* __restrict__ Reg)[4],
+                                                   uint32_t * __restrict__ read_RPTR_Frag1,
+                                                   uint32_t * __restrict__ read_RPTR_Frag2,
+                                                   uint32_t * Scales) {
     uint32_t *OutputRegs    = reinterpret_cast<uint32_t*> (Reg);
     uint32_t *Frag1_PTR     = read_RPTR_Frag1;
     uint32_t *Frag2_PTR     = read_RPTR_Frag2;

From 8e93557ecef42226cdb3e744a494ab7c56c40a76 Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Tue, 18 Jun 2024 22:37:50 +0800
Subject: [PATCH 5/6] fix indent

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ef6405a2d..0bec8c499 100644
--- a/setup.py
+++ b/setup.py
@@ -81,7 +81,7 @@ def get_extensions():
                 "-O3" if not debug_mode else "-O0",
                 "-t=0"
             ]
-    }
+        }
 
         if debug_mode:
             extra_compile_args["cxx"].append("/ZI")

From 02fd6ad834f367a9fa1a99b4193a0207c425de0b Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Tue, 18 Jun 2024 23:46:44 +0800
Subject: [PATCH 6/6] add -t=0 for linux

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0bec8c499..a6a2da8a7 100644
--- a/setup.py
+++ b/setup.py
@@ -62,6 +62,7 @@ def get_extensions():
             ],
             "nvcc": [
                 "-O3" if not debug_mode else "-O0",
+                "-t=0",
             ]
         }
 
@@ -79,7 +80,7 @@ def get_extensions():
             ],
             "nvcc": [
                 "-O3" if not debug_mode else "-O0",
-                "-t=0"
+                "-t=0",
             ]
         }