Supported types #1

Status: Open. Wants to merge 7 commits into base: develop.
2 changes: 2 additions & 0 deletions .markdownlint-cli2.yaml
@@ -0,0 +1,2 @@
ignores:
- docs/api-library.md
17 changes: 17 additions & 0 deletions .wordlist.txt
@@ -0,0 +1,17 @@
asynchronicity
bfloat
BFLOAT
ClippedReLU
csv
cuSPARSELt
GeLU
Gemm
GoogleTest
hipSPARSELt
LeakyReLU
ReLU
rocSPARSELt
sudo
SpMM
Tanh
tensorfloat
10 changes: 7 additions & 3 deletions CHANGELOG.md
@@ -1,21 +1,25 @@
# Change Log for hipSPARSELt

## (Unreleased) hipSPARSELt 0.2.0

### Added

- Support Matrix B is a Structured Sparsity Matrix.

## (Unreleased) hipSPARSELt 0.1.0

### Added

- Enable hipSPARSELt APIs
- Support platform: gfx940, gfx941, gfx942
- Support platform: gfx940, gfx941, gfx942
- Support problem type: fp16, bf16, int8
- Support activation: relu, gelu, abs, sigmoid, tanh
- Support gelu scaling
- Support bias vector
- Support batched computation (single sparse x multiple dense, multiple sparse x single dense)
- Support batched computation (single sparse x multiple dense, multiple sparse x
single dense)
- Support cuSPARSELt v0.4 backend
- Integrate with tensilelite kernel generator
- Add Gtest: hipsparselt-test
- Add benchmarking tool: hipsparselt-bench
- Add sample app: example_spmm_strided_batched, example_prune, example_compress

4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -148,7 +148,7 @@ else()
set( Tensile_ARCHITECTURE "${AMDGPU_TARGETS}" CACHE STRING "Tensile to use which architecture?" FORCE)

set( Tensile_LOGIC "asm_full" CACHE STRING "Tensile to use which logic?")
set( Tensile_CODE_OBJECT_VERSION "V2" CACHE STRING "Tensile code_object_version")
set( Tensile_CODE_OBJECT_VERSION "default" CACHE STRING "Tensile code_object_version")
set( Tensile_COMPILER "hipcc" CACHE STRING "Tensile compiler")
set( Tensile_LIBRARY_FORMAT "msgpack" CACHE STRING "Tensile library format")
set( Tensile_CPU_THREADS "" CACHE STRING "Number of threads for Tensile parallel build")
@@ -160,7 +160,7 @@ else()
set( Tensile_TEST_LOCAL_PATH "" CACHE PATH "Use local Tensile directory instead of fetching a GitHub branch" )

set_property( CACHE Tensile_LOGIC PROPERTY STRINGS aldebaran asm_full asm_lite asm_miopen hip_lite other )
set_property( CACHE Tensile_CODE_OBJECT_VERSION PROPERTY STRINGS V2 V3 )
set_property( CACHE Tensile_CODE_OBJECT_VERSION PROPERTY STRINGS default V4 V5 )
set_property( CACHE Tensile_COMPILER PROPERTY STRINGS hcc hipcc)
set_property( CACHE Tensile_LIBRARY_FORMAT PROPERTY STRINGS msgpack yaml)

File renamed without changes.
39 changes: 30 additions & 9 deletions README.md
@@ -1,10 +1,18 @@
# hipSPARSELt

hipSPARSELt is a SPARSE marshalling library, with multiple supported backends. It sits between the application and a 'worker' SPARSE library, marshalling inputs into the backend library and marshalling results back to the application. hipSPARSELt exports an interface that does not require the client to change, regardless of the chosen backend. Currently, hipSPARSELt supports [rocSPARSELt](library/src/hcc_detial/rocsparselt) and [cuSPARSELt v0.3](https://docs.nvidia.com/cuda/cusparselt) as backends.
hipSPARSELt is a SPARSE marshalling library, with multiple supported backends.
It sits between the application and a 'worker' SPARSE library, marshalling
inputs into the backend library and marshalling results back to the
application. hipSPARSELt exports an interface that does not require the client
to change, regardless of the chosen backend. Currently, hipSPARSELt supports
[rocSPARSELt](library/src/hcc_detial/rocsparselt) and [cuSPARSELt v0.4](https://docs.nvidia.com/cuda/cusparselt)
as backends.

## Installing pre-built packages

Download pre-built packages either from the [ROCm package servers](https://rocm.github.io/install.html#installing-from-amd-rocm-repositories) or by clicking the GitHub releases tab and manually downloading, which could be newer. Release notes are available for each release on the releases tab.
Download pre-built packages either from the [ROCm package servers](https://rocm.github.io/install.html#installing-from-amd-rocm-repositories)
or by clicking the GitHub releases tab and manually downloading, which could be
newer. Release notes are available for each release on the releases tab.

* `sudo apt update && sudo apt install hipsparselt`

@@ -24,7 +32,12 @@ Download pre-built packages either from the [ROCm package servers](https://rocm.

### Bash helper build script

The root of this repository has a helper bash script `install.sh` to build and install hipSPARSELt on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install. A few commands in the script need sudo access, so it may prompt you for a password.
The root of this repository has a helper bash script `install.sh` to build and
install hipSPARSELt on Ubuntu with a single command. It does not take a lot of
options and hard-codes configuration that can be specified through invoking
CMake directly, but it's a great way to get started quickly and can serve as an
example of how to build/install. A few commands in the script need sudo access,
so it may prompt you for a password.

```bash
# Run install.sh script
@@ -64,9 +77,10 @@ The root of this repository has a helper bash script `install.sh` to build and i
* Add support for Mixed-precision computation
* FP8 input/output, FP32 Matrix Core accumulate
* BF8 input/output, FP32 Matrix Core accumulate
* Add kernel selection and generator, used to provide the appropriate solution for the specific problem.
* Add kernel selection and generator, used to provide the appropriate
solution for the specific problem.
* CUDA
* Support cusparseLt v0.3
* Support cusparseLt v0.4

## Documentation

@@ -84,7 +98,10 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html

## hipSPARSELt interface examples

The hipSPARSELt interface is compatible with cuSPARSELt APIs. Porting a CUDA application which originally calls the cuSPARSELt API to an application calling hipSPARSELt API should be relatively straightforward. For example, the hipSPARSELt matmul interface is
The hipSPARSELt interface is compatible with cuSPARSELt APIs. Porting a CUDA
application which originally calls the cuSPARSELt API to an application calling
hipSPARSELt API should be relatively straightforward. For example, the
hipSPARSELt matmul interface is

### matmul API

@@ -103,13 +120,16 @@ hipsparseStatus_t hipsparseLtMatmul(const hipsparseLtHandle_t* handle,

```

hipSPARSELt assumes matrix A, B, C, D and workspace are allocated in GPU memory space filled with data. Users are responsible for copying data from/to the host and device memory.
hipSPARSELt assumes that matrices A, B, C, D and the workspace are allocated in
GPU memory and filled with data. Users are responsible for copying data to and
from host and device memory.
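
As a rough host-side sketch of the call pattern described above (handle and plan setup are elided, and the header path, pointer names, and scalar types here are assumptions for illustration, not taken verbatim from the library docs):

```c
#include <hipsparselt/hipsparselt.h>  /* assumed header path */

/* Sketch only: every matrix pointer and the workspace are device
 * allocations; the host must have copied the input data in beforehand. */
static hipsparseStatus_t sparse_matmul(const hipsparseLtHandle_t *handle,
                                       const hipsparseLtMatmulPlan_t *plan,
                                       const void *d_A, const void *d_B,
                                       void *d_C, void *d_D,
                                       void *d_workspace, hipStream_t stream)
{
    const float alpha = 1.0f, beta = 0.0f;
    /* D = alpha * op(A) * op(B) + beta * C, computed asynchronously on the GPU */
    return hipsparseLtMatmul(handle, plan, &alpha, d_A, d_B,
                             &beta, d_C, d_D, d_workspace, &stream, 1);
}
```

Building this requires the ROCm/HIP toolchain, so it is illustrative only; the argument order mirrors the cuSPARSELt `cusparseLtMatmul` signature that hipSPARSELt tracks.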

## Running tests and benchmark tool

### Unit tests

To run unit tests, hipSPARSELt has to be built with option -DBUILD_CLIENTS_TESTS=ON (or using ./install.sh -c)
To run unit tests, hipSPARSELt has to be built with option
-DBUILD_CLIENTS_TESTS=ON (or using ./install.sh -c)

```bash
# Go to hipSPARSELt build directory
@@ -121,7 +141,8 @@ cd hipSPARSELt; cd build/release

### Benchmarks

To run benchmarks, hipSPARSELt has to be built with option -DBUILD_CLIENTS_BENCHMARKS=ON (or using ./install.sh -c).
To run benchmarks, hipSPARSELt has to be built with option
-DBUILD_CLIENTS_BENCHMARKS=ON (or using ./install.sh -c).

```bash
# Go to hipSPARSELt build directory
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -15,7 +15,7 @@

extensions = ['sphinx_design', 'sphinx.ext.intersphinx']

exclude_patterns = ['reference/api-library.md', 'reference/_functions.rst']
exclude_patterns = ['reference/api-library.md']

external_toc_path = "./sphinx/_toc.yml"

27 changes: 0 additions & 27 deletions docs/reference/_functions.rst

This file was deleted.

9 changes: 6 additions & 3 deletions docs/reference/api-library.md
@@ -1,4 +1,5 @@
The hipSPARSELt library is organized as follows:
<!-- spellcheck-disable -->

* @ref types_module
* @ref library_module
Expand All @@ -9,6 +10,8 @@ The hipSPARSELt library is organized as follows:
* @ref helper_module
* @ref aux_module

Note that all hipSPARSELt library functions, unless otherwise stated, are non-blocking and are run
asynchronously with respect to the host. They may return before the actual computation has finished.
To force synchronization, use `hipDeviceSynchronize` or `hipStreamSynchronize`.
<!-- spellcheck-enable -->
Note that all hipSPARSELt library functions, unless otherwise stated, are
non-blocking and are run asynchronously with respect to the host. They may
return before the actual computation has finished. To force synchronization, use
`hipDeviceSynchronize` or `hipStreamSynchronize`.
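
A minimal sketch of that synchronization rule, assuming the HIP runtime (function name is hypothetical, error checking elided):

```c
#include <hip/hip_runtime.h>

/* Sketch: asynchronous hipSPARSELt calls enqueued on a stream return
 * immediately; the host must synchronize before reading results back. */
static void run_and_wait(hipStream_t stream)
{
    /* ... hipsparseLt* work enqueued on `stream` here ... */

    hipStreamSynchronize(stream);  /* block the host until the device work completes */
    /* results are now safe to hipMemcpy back to the host */
}
```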
148 changes: 137 additions & 11 deletions docs/reference/data-type-support.rst
@@ -8,14 +8,140 @@
Data type support
******************************************

.. csv-table::
:header: "Input", "Output", "Compute type", "Backend"

"HIPSPARSELT_R_16F", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_32F", "HIP"
"HIPSPARSELT_R_16BF", "HIPSPARSELT_R_16BF", "HIPSPARSELT_COMPUTE_32F", "HIP"
"HIPSPARSELT_R_8I", "HIPSPARSELT_R_8I", "HIPSPARSELT_COMPUTE_32I", "HIP / CUDA"
"HIPSPARSELT_R_8I", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_32I", "HIP / CUDA"
"HIPSPARSELT_R_16F", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_16F", "CUDA"
"HIPSPARSELT_R_16BF", "HIPSPARSELT_R_16BF", "HIPSPARSELT_COMPUTE_16F", "CUDA"
"HIPSPARSELT_R_32F", "HIPSPARSELT_R_32F", "HIPSPARSELT_COMPUTE_TF32", "CUDA"
"HIPSPARSELT_R_32F", "HIPSPARSELT_R_32F", "HIPSPARSELT_COMPUTE_TF32_FAST", "CUDA"
* Supported input and output types.

.. list-table:: Supported Input/Output Types
:header-rows: 1
:name: supported-input-output-types

*
- Input/Output Types
- Library Data Type
- AMD Supports
- CUDA Supports
*
- int8
- HIPSPARSELT_R_8I
- ✅
- ✅
*
- float8
- HIPSPARSELT_R_8F
- ❌
- ❌
*
- bfloat8
- HIPSPARSELT_R_8BF
- ❌
- ❌
*
- int16
- Not Supported
- ❌
- ❌
*
- float16
- HIPSPARSELT_R_16F
- ✅
- ✅
*
- bfloat16
- HIPSPARSELT_R_16BF
- ✅
- ✅
*
- int32
- Not Supported
- ❌
- ❌
*
- tensorfloat32
- Not Supported
- ❌
- ❌
*
- float32
- HIPSPARSELT_R_32F
- ❌
- ✅
*
- float64
- Not Supported
- ❌
- ❌

* Supported accumulator types.

.. list-table:: Supported Compute Types
:header-rows: 1
:name: supported-accumulator-types

*
- Accumulator Types
- Library Data Type
- AMD Supports
- CUDA Supports
*
- int8
- Not Supported
- ❌
- ❌
*
- float8
- Not Supported
- ❌
- ❌
*
- bfloat8
- Not Supported
- ❌
- ❌
*
- int16
- Not Supported
- ❌
- ❌
*
- float16
- HIPSPARSELT_COMPUTE_16F
- ❌
- ✅
*
- bfloat16
- Not Supported
- ❌
- ❌
*
- int32
- HIPSPARSELT_COMPUTE_32I
- ✅
- ✅
*
- tensorfloat32
- HIPSPARSELT_COMPUTE_TF32
- ❌
- ✅
*
- float32
- HIPSPARSELT_COMPUTE_32F
- ✅
- ❌
*
- float64
- Not Supported
- ❌
- ❌

* List of supported compute types for specific input and output type combinations:

.. csv-table::
:header: "Input", "Output", "Compute type", "Backend"

"HIPSPARSELT_R_16F", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_32F", "HIP"
"HIPSPARSELT_R_16BF", "HIPSPARSELT_R_16BF", "HIPSPARSELT_COMPUTE_32F", "HIP"
"HIPSPARSELT_R_8I", "HIPSPARSELT_R_8I", "HIPSPARSELT_COMPUTE_32I", "HIP / CUDA"
"HIPSPARSELT_R_8I", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_32I", "HIP / CUDA"
"HIPSPARSELT_R_16F", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_16F", "CUDA"
"HIPSPARSELT_R_16BF", "HIPSPARSELT_R_16BF", "HIPSPARSELT_COMPUTE_16F", "CUDA"
"HIPSPARSELT_R_32F", "HIPSPARSELT_R_32F", "HIPSPARSELT_COMPUTE_TF32", "CUDA"
"HIPSPARSELT_R_32F", "HIPSPARSELT_R_32F", "HIPSPARSELT_COMPUTE_TF32_FAST", "CUDA"
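
The combinations in the table above can be captured as a small lookup structure; a sketch in Python (the dictionary and helper names are illustrative, not part of the library):

```python
# Supported (input, output, compute) combinations per backend, transcribed
# from the table above. Names here are illustrative only.
SUPPORTED_COMBOS = {
    ("HIPSPARSELT_R_16F", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_32F"): {"HIP"},
    ("HIPSPARSELT_R_16BF", "HIPSPARSELT_R_16BF", "HIPSPARSELT_COMPUTE_32F"): {"HIP"},
    ("HIPSPARSELT_R_8I", "HIPSPARSELT_R_8I", "HIPSPARSELT_COMPUTE_32I"): {"HIP", "CUDA"},
    ("HIPSPARSELT_R_8I", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_32I"): {"HIP", "CUDA"},
    ("HIPSPARSELT_R_16F", "HIPSPARSELT_R_16F", "HIPSPARSELT_COMPUTE_16F"): {"CUDA"},
    ("HIPSPARSELT_R_16BF", "HIPSPARSELT_R_16BF", "HIPSPARSELT_COMPUTE_16F"): {"CUDA"},
    ("HIPSPARSELT_R_32F", "HIPSPARSELT_R_32F", "HIPSPARSELT_COMPUTE_TF32"): {"CUDA"},
    ("HIPSPARSELT_R_32F", "HIPSPARSELT_R_32F", "HIPSPARSELT_COMPUTE_TF32_FAST"): {"CUDA"},
}

def backends_for(input_t, output_t, compute_t):
    """Return the set of backends supporting a combination (empty set if none)."""
    return SUPPORTED_COMBOS.get((input_t, output_t, compute_t), set())
```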
4 changes: 2 additions & 2 deletions docs/reference/porting.rst
@@ -12,9 +12,9 @@ The hipSPARSELt interface is compatible with cuSPARSELt APIs. Porting a CUDA application that
originally calls the cuSPARSELt API to an application that calls the hipSPARSELt API should be relatively
straightforward.

For example, the hipSPARSELt matmul interface is:
For example, the hipSPARSELt matrix multiplication interface is:

Matmul API
Matrix multiplication API

.. code-block:: c

5 changes: 3 additions & 2 deletions docs/reference/supported-functions.rst
@@ -17,6 +17,7 @@ ROCm & CUDA supported functions
* FP16 input/output, FP32 Matrix Core accumulate
* BFLOAT16 input/output, FP32 Matrix Core accumulate
* INT8 input/output, INT32 Matrix Core accumulate
* INT8 input, FP16 output, INT32 Matrix Core accumulate

* Matrix pruning and compression functionalities
* Auto-tuning functionality (see ``hipsparseLtMatmulSearch()``)
@@ -25,7 +26,7 @@ ROCm & CUDA supported functions
* Single sparse matrix/Multiple dense matrices (Broadcast)
* Multiple sparse and dense matrices

* Activation function fuse in spmm kernel support:
* Activation function fuse in SpMM kernel support:

* ReLU
* ClippedReLU (ReLU with upper bound and threshold setting)
@@ -45,4 +46,4 @@ ROCm & CUDA supported functions

* CUDA

* Support cusparseLt v0.3
* Support cuSPARSELt v0.4