diff --git a/docs/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst deleted file mode 100644 index 7bb064ee..00000000 --- a/docs/API_Reference_Guide.rst +++ /dev/null @@ -1,488 +0,0 @@ -************ -Introduction -************ - -rocWMMA is AMD's C++ library for accelerating mixed precision matrix multiply-accumulate operations -leveraging specialized GPU matrix cores on AMD's latest discrete GPUs. - -A C++ API is provided to facilitate decomposition of matrix multiply-accumulate problems into -discretized block fragments and to parallelize block-wise operations across multiple GPU wavefronts. - -The API is implemented in GPU device code: it empowers user device kernel code with direct use of GPU matrix cores. -Moreover, this code can benefit from inline compiler optimization passes and does not incur additional -overhead of external runtime calls or extra kernel launches. - -======== ========= -Acronym Expansion -======== ========= -**GEMM** **GE**\ neral **M**\ atrix **M**\ ultiply -**WMMA** **W**\ avefront **M**\ ixed precision **M**\ ultiply **A**\ ccumulate -**HIP** **H**\ eterogeneous-Compute **I**\ nterface for **P**\ ortability -======== ========= - -rocWMMA is written in C++ and may be applied directly in device kernel code. Library code is templated -for modularity and uses available meta-data to provide opportunities for compile-time inferences and optimizations. - -The rocWMMA API exposes block-wise data load / store and matrix multiply-accumulate functions appropriately sized -for thread-block execution on data fragments. Matrix multiply-accumulate functionality supports mixed precision inputs -and outputs with native fixed-precision accumulation. The rocWMMA Coop API provides wave/warp collaborations -within the thread-blocks for block-wise data load and stores. Supporting code is required for GPU device -management and for kernel invocation. Kernel code samples and tests provided are built and launched via the HIP -ecosystem within ROCm. - -Below is a simple example code for calling rocWMMA functions load_matrix_sync, store_matrix_sync, fill_fragment, mma_sync. - -.. code-block:: c++ - - #include - #include - #include - - #include - #include - - #include - - using rocwmma::float16_t; - using rocwmma::float32_t; - - // Matrix data initialization - template - __host__ static inline void fill(DataT* mat, uint32_t m, uint32_t n) - { - auto ld = n; - for(int i = 0; i < m; ++i) - { - for(int j = 0; j < n; ++j) - { - // Generated data - // Alternate sign every 3 elements - auto value = (i * n + j) % 13; - mat[i * ld + j] = (value % 3) ? -static_cast(value) : static_cast(value); - } - } - } - - // Supports BlockM/N square sizes of - // : 16 x 16 - // : 32 x 32 - const int ROCWMMA_M = 16; - const int ROCWMMA_N = 16; - - // Supports ROCWMMA_K sizes as - // : multiples of 16. - const int ROCWMMA_K = 16; - - // AMDGCN default wave size - const int WAVE_SIZE = rocwmma::AMDGCN_WAVE_SIZE; - - // Thread block - // : T_BLOCK_X must be multiple of WAVE_SIZE. - // Note: Each wave will compute one BLOCK_M x BLOCK_N output block - // Note: Workgroup will compute - // T_BLOCK_X / WAVE_SIZE x T_BLOCK_Y output blocks - // This thread block will compute (4 x 4 output blocks) - const int T_BLOCK_X = 4 * WAVE_SIZE; - const int T_BLOCK_Y = 4; - - // The following device kernel is a naive implementation - // of blocked GEMM. 
Each wave will compute one BLOCK_M x BLOCK_N - // output block of the M x N x K GEMM, generalized as: - // D = alpha * (A x B) + beta * C - // - // In this simplified example, we assume: - // : A is in row-major format (m x k) - // : B is in col-major format (k x n) - // : C, D are in row-major format (m x n) - // : Multiplication is NOT in-place, output is written to D matrix - // : No LDS required - // - // Disclaimer: This is a simplified implementation to demonstrate API usage in - // context of wave-level GEMM computation, and is not optimized. - // - // Launchable device kernel function: - // - __global__ void gemm_wmma_d(uint32_t m, // matrix free dim m - uint32_t n, // matrix free dim n - uint32_t k, // matrix fixed dim k - float16_t const* a, // device data ptr for matrix A - float16_t const* b, // device data ptr for matrix B - float32_t const* c, // device data ptr for matrix C - float32_t* d, // device data ptr for matrix D - uint32_t lda, // leading dimension for matrix A - uint32_t ldb, // leading dimension for matrix B - uint32_t ldc, // leading dimension for matrix C - uint32_t ldd, // leading dimension for matrix D - float32_t alpha, // uniform scalar - float32_t beta) // uniform scalar - { - // Create frags with meta-data context for block-wise GEMM decomposition - // @tp0: fragment context = matrix_a, matrix_b or accumulator - // @tp1: block size M - // @tp2: block size N - // @tp3: block size K - // @tp4: fragment data type - // @tp5: data layout = row_major, col_major or void (default) - auto fragA = rocwmma::fragment(); - auto fragB = rocwmma::fragment(); - auto fragC = rocwmma::fragment(); - auto fragAcc = rocwmma::fragment(); - - // Initialize accumulator fragment - rocwmma::fill_fragment(fragAcc, 0.0f); - - // Tile using a 2D grid - auto majorWarp = (blockIdx.x * blockDim.x + threadIdx.x) / WAVE_SIZE; - auto minorWarp = (blockIdx.y * blockDim.y + threadIdx.y); - - // Target C block - auto cRow = majorWarp * ROCWMMA_M; - auto cCol = minorWarp * ROCWMMA_N; - - // Bounds check - if(cRow < m && cCol < n) - { - // fragAcc = A x B - for(int i = 0; i < k; i += ROCWMMA_K) - { - // Load the inputs - rocwmma::load_matrix_sync(fragA, a + (cRow * lda + i), lda); - rocwmma::load_matrix_sync(fragB, b + (i + cCol * ldb), ldb); - - // Matrix multiply - accumulate using MFMA units - rocwmma::mma_sync(fragAcc, fragA, fragB, fragAcc); - } - - // Fetch C matrix - rocwmma::load_matrix_sync(fragC, c + (cRow * ldc + cCol), ldc, rocwmma::mem_row_major); - - // D = alpha * A x B + beta * C - for(int i = 0; i < fragC.num_elements; ++i) - { - fragC.x[i] = alpha * fragAcc.x[i] + beta * fragC.x[i]; - } - - // Store to D - rocwmma::store_matrix_sync(d + (cRow * ldd + cCol), fragC, ldd, rocwmma::mem_row_major); - } - } - - // Host side supporting device mgmt and launch code - __host__ void gemm_test(uint32_t m, uint32_t n, uint32_t k, float32_t alpha, float32_t beta) - { - // Problem size check - if((m < (ROCWMMA_M * T_BLOCK_X / WAVE_SIZE) || n < (ROCWMMA_N * T_BLOCK_Y) || k < ROCWMMA_K) - || (m % ROCWMMA_M || n % ROCWMMA_N || k % ROCWMMA_K)) - { - std::cout << "Unsupported size!\n"; - return; - } - - int lda = k; - int ldb = k; - int ldc = n; - int ldd = ldc; - - std::cout << "Initializing host data..." 
<< std::endl; - - // Initialize input matrices - std::vector matrixA(m * k); - std::vector matrixB(k * n); - std::vector matrixC(m * n); - // Fill outputs with NaN to catch contamination - std::vector matrixD(m * n, std::numeric_limits::signaling_NaN()); - - fill(matrixA.data(), m, k); - fill(matrixB.data(), k, n); - fill(matrixC.data(), m, n); - - std::cout << "Initializing device data..." << std::endl; - - // Allocate and copy device memory - float16_t* d_a; - float16_t* d_b; - float32_t* d_c; - float32_t* d_d; - - const size_t bytesA = matrixA.size() * sizeof(float16_t); - const size_t bytesB = matrixB.size() * sizeof(float16_t); - const size_t bytesC = matrixC.size() * sizeof(float32_t); - const size_t bytesD = matrixD.size() * sizeof(float32_t); - - CHECK_HIP_ERROR(hipMalloc(&d_a, bytesA)); - CHECK_HIP_ERROR(hipMalloc(&d_b, bytesB)); - CHECK_HIP_ERROR(hipMalloc(&d_c, bytesC)); - CHECK_HIP_ERROR(hipMalloc(&d_d, bytesD)); - - CHECK_HIP_ERROR(hipMemcpy(d_a, matrixA.data(), bytesA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_b, matrixB.data(), bytesB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_c, matrixC.data(), bytesC, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_d, matrixD.data(), bytesD, hipMemcpyHostToDevice)); - - auto blockDim = dim3(T_BLOCK_X, T_BLOCK_Y); - auto gridDim = dim3(rocwmma::ceilDiv(m, ROCWMMA_M * T_BLOCK_X / WAVE_SIZE), - rocwmma::ceilDiv(n, ROCWMMA_N * T_BLOCK_Y)); - - std::cout << "Launching GEMM kernel..." << std::endl; - - hipEvent_t startEvent, stopEvent; - CHECK_HIP_ERROR(hipEventCreate(&startEvent)); - CHECK_HIP_ERROR(hipEventCreate(&stopEvent)); - - hipExtLaunchKernelGGL(gemm_wmma_d, - gridDim, - blockDim, - 0, // sharedMemBytes - 0, // stream - startEvent, // Event start - stopEvent, // event stop - 0, // flags - m, - n, - k, - d_a, - d_b, - d_c, - d_d, - lda, - ldb, - ldc, - ldd, - alpha, - beta); - - auto elapsedTimeMs = 0.0f; - CHECK_HIP_ERROR(hipEventSynchronize(stopEvent)); - CHECK_HIP_ERROR(hipEventElapsedTime(&elapsedTimeMs, startEvent, stopEvent)); - CHECK_HIP_ERROR(hipEventDestroy(startEvent)); - CHECK_HIP_ERROR(hipEventDestroy(stopEvent)); - - // Release device memory - CHECK_HIP_ERROR(hipFree(d_a)); - CHECK_HIP_ERROR(hipFree(d_b)); - CHECK_HIP_ERROR(hipFree(d_c)); - CHECK_HIP_ERROR(hipFree(d_d)); - - std::cout << "Finished!" << std::endl; - } - - int main() - { - gemm_test(256, 256, 256, 2.1f, 2.1f); - return 0; - } - -Synchronous API -^^^^^^^^^^^^^^^ - -In general, rocWMMA API functions ( load_matrix_sync, store_matrix_sync, mma_sync ) are assumed to be synchronous when -used in context of global memory. - -When using these functions in the context of shared memory (e.g. LDS memory), additional explicit workgroup synchronization -may be required due to the nature this memory usage. - - -Supported Data Types -^^^^^^^^^^^^^^^^^^^^ - -rocWMMA mixed precision multiply-accumulate operations support the following data type combinations. - -Data Types **** = - -where - -Input Type = Matrix A/B - -Output Type = Matrix C/D - -Compute Type = math / accumulation type - -.. 
tabularcolumns:: - |C|C|C|C| - -+------------------------------+------------+-----------+---------------+ -|Ti / To / Tc |BlockM |BlockN |BlockK | -+==============================+============+===========+===============+ -|i8 / i32 / i32 |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|i8 / i32 / i32 |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|i8 / i8 / i32 |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|i8 / i32 / i32 |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|f16 / f32 / f32 |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|f16 / f32 / f32 |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|f16 / f16 / f32 |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|f16 / f16 / f32 |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|f16 / f16 / f16* |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|f16 / f16 / f16* |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|__half / f32 / f32 |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|__half / f32 / f32 |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|__half / __half / f32 |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|__half / __half / f32 |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|__half / __half / __half* |16 |16 |Min: 16, pow2 | -+------------------------------+------------+-----------+---------------+ -|__half / __half / __half* |32 |32 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|bf16 / f32 / f32 |16 |16 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|bf16 / f32 / f32 |32 |32 |Min: 4, pow2 | -+------------------------------+------------+-----------+---------------+ -|bf16 / bf16 / f32 |16 |16 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|bf16 / bf16 / f32 |32 |32 |Min: 4, pow2 | -+------------------------------+------------+-----------+---------------+ -|bf16 / bf16 / bf16* |16 |16 |Min: 8, pow2 | -+------------------------------+------------+-----------+---------------+ -|bf16 / bf16 / bf16* |32 |32 |Min: 4, pow2 | -+------------------------------+------------+-----------+---------------+ -|f32 / f32 / f32 |16 |16 |Min: 4, pow2 | -+------------------------------+------------+-----------+---------------+ -|f32 / f32 / f32 |32 |32 |Min: 2, pow2 | -+------------------------------+------------+-----------+---------------+ -|f64** / f64** / f64** |16 |16 |Min: 4, pow2 | -+------------------------------+------------+-----------+---------------+ - -*= matrix unit accumulation is natively 32 bit precision, and is converted to desired type. - -**= f64 datatype is only supported on MI-200 class AMDGPU and successors. - - -Supported Matrix Layouts -^^^^^^^^^^^^^^^^^^^^^^^^ - -(N = col major, T = row major) - -.. 
tabularcolumns:: - |C|C|C|C| - -+---------+--------+---------+--------+ -|LayoutA |LayoutB |Layout C |LayoutD | -+=========+========+=========+========+ -|N |N |N |N | -+---------+--------+---------+--------+ -|N |N |T |T | -+---------+--------+---------+--------+ -|N |T |N |N | -+---------+--------+---------+--------+ -|N |T |T |T | -+---------+--------+---------+--------+ -|T |N |N |N | -+---------+--------+---------+--------+ -|T |N |T |T | -+---------+--------+---------+--------+ -|T |T |N |N | -+---------+--------+---------+--------+ -|T |T |T |T | -+---------+--------+---------+--------+ - ------------------ -Using rocWMMA API ------------------ - -This section describes how to use the rocWMMA library API. - - -rocWMMA Datatypes -^^^^^^^^^^^^^^^^^ - -matrix_a -'''''''' - -.. doxygenstruct:: rocwmma::matrix_a - - -matrix_b -'''''''' - -.. doxygenstruct:: rocwmma::matrix_b - - -accumulator -''''''''''' - -.. doxygenstruct:: rocwmma::accumulator - - -row_major -''''''''' - -.. doxygenstruct:: rocwmma::row_major - - -col_major -''''''''' - -.. doxygenstruct:: rocwmma::col_major - - -VecT -'''' - -.. doxygenclass:: VecT - - - -IOConfig -'''''''''''' - -.. doxygenstruct:: rocwmma::IOConfig - - -IOShape -'''''''''''' - -.. doxygenstruct:: rocwmma::IOShape - - -rocWMMA Enumeration -^^^^^^^^^^^^^^^^^^^ - - Enumeration constants have numbering that is consistent with standard C++ libraries. - - -layout_t -'''''''''''' - -.. doxygenenum:: rocwmma::layout_t - - -rocWMMA API functions -^^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: fill_fragment - -.. doxygenfunction:: load_matrix_sync(fragment& frag, const DataT* data, uint32_t ldm) - -.. doxygenfunction:: load_matrix_sync(fragment& frag, const DataT* data, uint32_t ldm, layout_t layout) - -.. doxygenfunction:: store_matrix_sync(DataT* data, fragment const& frag, uint32_t ldm) - -.. doxygenfunction:: store_matrix_sync(DataT* data, fragment const& frag, uint32_t ldm,layout_t layout) - -.. doxygenfunction:: mma_sync - -.. doxygenfunction:: synchronize_workgroup - -.. doxygenfunction:: load_matrix_coop_sync(fragment& frag, const DataT* data, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount, uint32_t splitCount) - -.. doxygenfunction:: load_matrix_coop_sync(fragment& frag, const DataT* data, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount) - -.. doxygenfunction:: load_matrix_coop_sync(fragment& frag, const DataT* data, uint32_t ldm) - -.. doxygenfunction:: store_matrix_coop_sync(DataT* data, fragment const& frag, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount, uint32_t splitCount) - -.. doxygenfunction:: store_matrix_coop_sync(DataT* data, fragment const& frag, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount) - -.. doxygenfunction:: store_matrix_coop_sync(DataT* data, fragment const& frag, uint32_t ldm) diff --git a/docs/Linux_Install_Guide.rst b/docs/Linux_Install_Guide.rst deleted file mode 100644 index ba527635..00000000 --- a/docs/Linux_Install_Guide.rst +++ /dev/null @@ -1,291 +0,0 @@ -=============================== -Getting Started Guide for Linux -=============================== - ------------- -Introduction ------------- - -This document contains instructions for installing, using, and contributing to rocWMMA. -The quickest way to install is from prebuilt packages. Alternatively, there are instructions to build from source. The document also contains an API Reference Guide, Programmer's Guide, and Contributor's Guides. 
- -Documentation Roadmap -^^^^^^^^^^^^^^^^^^^^^ -The following is a list of rocWMMA documents in the suggested reading order: - - - Getting Started Guide (this document): Describes how to install and configure the rocWMMA library; designed to get users up and running quickly with the library. - - API Reference Guide : Provides detailed information about rocWMMA functions, data types and other programming constructs. - - Programmer's Guide: Describes the code organization, Design implementation detail, Optimizations used in the library and those that should be considered for new development and Testing & Benchmarking detail. - - Contributor's Guide : Describes coding guidelines for contributors. - -------------- -Prerequisites -------------- - -- A ROCm enabled platform, more information `here `_. - - ------------------------------ -Installing pre-built packages ------------------------------ - -rocWMMA can be installed on Ubuntu or Debian using - -:: - - sudo apt-get update - sudo apt-get install rocWMMA - -rocWMMA can be installed on CentOS using - -:: - - sudo yum update - sudo yum install rocWMMA - -rocWMMA can be installed on SLES using - -:: - - sudo dnf upgrade - sudo dnf install rocWMMA - -Once installed, rocWMMA can be used just like any other library with a C++ API. -The rocwmma.hpp header file will need to be included in the user code in order to make calls -into rocWMMA. - -Once installed, rocwmma.hpp can be found in the /opt/rocm/include directory. -Only this installed file should be used when needed in user code. -Other rocWMMA files can be found in /opt/rocm/include/internal, however these files -should not be directly included. - - -------------------------------- -Building and Installing rocWMMA -------------------------------- - -For most users building from source is not necessary, as rocWMMA can be used after installing the pre-built -packages as described above. If desired, the following instructions can be used to build rocWMMA from source. - -System Requirements -^^^^^^^^^^^^^^^^^^^ -As a general rule, 8GB of system memory is required for a full rocWMMA build. This value can be lower if rocWMMA is built without tests. This value may also increase in the future as more functions are added to rocWMMA. - - -GPU Support -^^^^^^^^^^^ -AMD CDNA class GPU featuring matrix core support: gfx908, gfx90a as 'gfx9' - -`Note: Double precision FP64 datatype support requires gfx90a` - -OR - -AMD RDNA3 class GPU featuring AI acceleration support: gfx1100, gfx1101, gfx1102 as 'gfx11' - - -Download rocWMMA -^^^^^^^^^^^^^^^^ - -The rocWMMA source code is available at the `rocWMMA github page `_. rocWMMA has a minimum ROCm support version 5.4. -Check the ROCm Version on your system. For Ubuntu use - -:: - - apt show rocm-libs -a - -For Centos use - -:: - - yum info rocm-libs - -The ROCm version has major, minor, and patch fields, possibly followed by a build specific identifier. For example the ROCm version could be 4.0.0.40000-23, this corresponds to major = 4, minor = 0, patch = 0, build identifier 40000-23. -There are GitHub branches at the rocWMMA site with names rocm-major.minor.x where major and minor are the same as in the ROCm version. For ROCm version 4.0.0.40000-23 you need to use the following to download rocWMMA: - -:: - - git clone -b release/rocm-rel-x.y https://github.com/ROCmSoftwarePlatform/rocWMMA.git - cd rocWMMA - -Replace x.y in the above command with the version of ROCm installed on your machine. 
For example: if you have ROCm 5.0 installed, then replace release/rocm-rel-x.y with release/rocm-rel-5.0 - -The user can build either - -* library - -* library + samples - -* library + tests - -* library + tests + assembly - -You only need (library) if you call rocWMMA from your code. -The client contains the test samples and benchmark code. - -Below are the project options available to build rocWMMA library with/without clients. - -.. tabularcolumns:: - |C|C|C| - -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|Option |Description |Default Value | -+==============================+=====================================+========================================================================================+ -|AMDGPU_TARGETS |Build code for specific GPU target(s)|gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1100;gfx1101;gfx1102 | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BUILD_TESTS |Build Tests |ON | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BUILD_SAMPLES |Build Samples |ON | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BUILD_ASSEMBLY |Generate assembly files |OFF | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BUILD_VALIDATION_TESTS|Build validation tests |ON (requires ROCWMMA_BUILD_TESTS=ON) | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BUILD_BENCHMARK_TESTS |Build benchmark tests |OFF (requires ROCWMMA_BUILD_TESTS=ON) | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BUILD_EXTENDED_TESTS |Build extended testing coverage |OFF (requires ROCWMMA_BUILD_TESTS=ON) | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_VALIDATE_WITH_ROCBLAS |Use rocBLAS for validation tests |ON (requires ROCWMMA_BUILD_VALIDATION_TESTS=ON) | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ -|ROCWMMA_BENCHMARK_WITH_ROCBLAS|Include rocBLAS benchmarking data |OFF (requires ROCWMMA_BUILD_BENCHMARK_TESTS=ON) | -+------------------------------+-------------------------------------+----------------------------------------------------------------------------------------+ - - -Build only library -^^^^^^^^^^^^^^^^^^ - -ROCm-cmake has a minimum version requirement 0.8.0 for ROCm 5.3. - -Minimum ROCm version support is 5.4. - -By default, the project is configured as Release mode. - -To build only library, run the following comomand : - - CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_BUILD_TESTS=OFF -DROCWMMA_BUILD_SAMPLES=OFF - -Here are some other example project configurations: - -.. 
tabularcolumns:: - |\X{1}{4}|\X{3}{4}| - -+-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Configuration | Command | -+===================================+====================================================================================================================+ -| Basic | CC=hipcc CXX=hipcc cmake -B . | -+-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Targeting gfx908 | CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack- | -+-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Debug build | CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug | -+-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Build without rocBLAS(default on) | CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_VALIDATE_WITH_ROCBLAS=OFF -DROCWMMA_BENCHMARK_WITH_ROCBLAS=OFF | -+-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ - -After configuration, build with - - cmake --build -- -j - - -Build library + samples -^^^^^^^^^^^^^^^^^^^^^^^ - -To build library and samples, run the following comomand : - - CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_BUILD_TESTS=OFF -DROCWMMA_BUILD_SAMPLES=ON - -After configuration, build with - - cmake --build -- -j - -The samples folder in contains executables in the table below. - -================ ============================================================================================================================== -executable name description -================ ============================================================================================================================== -simple_sgemm a simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for single-precision floating point types -simple_dgemm a simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for double-precision floating point types -simple_hgemm a simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for half-precision floating point types - -perf_sgemm an optimized GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for single-precision floating point types -perf_dgemm an optimized GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for double-precision floating point types -perf_hgemm an optimized GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for half-precision floating point types - -simple_sgemv a simple GEMV operation [y = alpha * (A) * x + beta * y] using rocWMMA API for single-precision fp32 inputs and output -simple_dgemv a simple GEMV operation [y = alpha * (A) * x + beta * y] using rocWMMA API for double-precision fp64 inputs and output - -simple-dlrm a simple DLRM operation using rocWMMA API - -hipRTC_gemm a simple GEMM operation [D = alpha * (A x B) + beta * C] demonstrating runtime compilation (hipRTC) compatibility -================ ============================================================================================================================== - - -Build library + tests -^^^^^^^^^^^^^^^^^^^^^ -rocWMMA has several test suites that can be built: - 
-- DLRM tests -- GEMM tests -- Unit tests - -DLRM tests cover the dot product interactions between embeddings used in DLRM. - -GEMM tests cover block-wise Generalized Matrix Multiplication (GEMM) implemented with rocWMMA. - -Unit tests cover various aspects of rocWMMA API and internal functionality. - -rocWMMA can build both validation and benchmark tests. The library uses CPU or rocBLAS methods for validation (where available) and benchmark comparisons based on the provided project option. -By default, the project is linked against rocBLAS for validating results. -Minimum ROCBLAS library version requirement is 2.39.0 for ROCm 4.3.0 - -To build library and tests, run the following command : - - CC=hipcc CXX=hipcc cmake -B . - -After configuration, build with - - cmake --build -- -j - -The tests in contains executables in the table below. - -====================================== =========================================================================================================== -executable name description -====================================== =========================================================================================================== -dlrm/dlrm_dot_test-* a DLRM implementation using rocWMMA API -dlrm/dlrm_dot_lds_test-* a DLRM implementation using rocWMMA API with LDS shared memory -gemm/mma_sync_test-* a simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API -gemm/mma_sync_multi_test-* a modified GEMM operation, each wave targets a sub-grid of output blocks using rocWMMA API -gemm/mma_sync_multi_ad_hoc_test-* an adhoc version of mma_sync_multi_test-* -gemm/mma_sync_multi_lds_test-* a modified GEMM operation, each wave targets a sub-grid of output blocks using LDS memory and rocWMMA API and wave-level collaboration -gemm/mma_sync_multi_lds_ad_hoc_test-* an adhoc version of mma_sync_multi_lds_test-* -gemm/mma_sync_coop_wg_test-* a modified GEMM operation, each wave targets a sub-grid of output blocks using LDS memory and rocWMMA API and workgroup-level collaboration -gemm/mma_sync_coop_wg_ad_hoc_test-* an adhoc version of mma_sync_coop_wg_test-* -gemm/barrier_test-* a simple GEMM operation with wave synchronization -unit/contamination_test tests against contamination of pristine data for loads and stores -unit/cross_lane_ops_test tests cross-lane vector operations -unit/fill_fragment_test tests fill_fragment API function -unit/io_shape_test tests input/output shape meta data -unit/io_traits_test tests input/output logistical meta data -unit/layout_test tests accuracy of internal matrix layout patterns -unit/load_store_matrix_sync_test tests load_matrix_sync and store_matrix_sync API functions -unit/load_store_matrix_coop_sync_test tests load_matrix_coop_sync and store_matrix_coop_sync API functions -unit/map_util_test tests mapping utilities used in rocWMMA implementations -unit/vector_iterator_test tests internal vector storage iteration implementation -unit/vector_test tests internal vector storage implementation -====================================== =========================================================================================================== - -*= validate: executables that compare outputs for correctness against reference sources such as CPU or rocBLAS calculations. - -*= bench: executables that measure kernel execution speeds and may compare against those of rocBLAS references. 
- - -Build library + Tests + Assembly -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To build library and tests with assembly code generation, run the following command : - - CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_BUILD_ASSEMBLY=ON - -After configuration, build with - - cmake --build -- -j - -The assembly folder in contains assembly generation of test executables in the format [test_executable_name.s] diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst deleted file mode 100644 index 39d3ce5f..00000000 --- a/docs/Programmers_Guide.rst +++ /dev/null @@ -1,151 +0,0 @@ - -=================== -Programmer's Guide -=================== - --------------------------------- -Library Source Code Organization --------------------------------- - -The rocWMMA code is split into four major parts: - -- The `library` directory contains all source code for the library. -- The `samples` directory contains real-world use-cases of the rocWMMA API. -- The `test` directory contains all validation, performance and unit tests of rocWMMA API. -- Infrastructure - -The `library` directory -^^^^^^^^^^^^^^^^^^^^^^^ - -library/include/rocwmma/ -'''''''''''''''''''''''' - -Contains C++ include files for the rocWMMA API. These files also contain Doxygen -comments that document the API. - -library/include/internal -'''''''''''''''''''''''' - -Internal include files for: - -- Type support -- Input / output configuration, shapes and traits -- Layout -- Mapping Utility -- Cross-lane operation utility -- Vector blend utility -- Packing and unpacking -- Conversion and broadcasting -- Load and store -- Matrix multiply-accumulate -- Cooperative load and store -- Threadblock synchronization -- Utility code - - -The `samples` directory -^^^^^^^^^^^^^^^^^^^^^^^ -samples/hipRTC_gemm.cpp -''''''''''''''''''''''' - -sample code for calling Simple GEMM algorithm demonstration without LDS memory usage and no transpose, from within the hipRTC environment. - - -samples/simple_sgemv.cpp -'''''''''''''''''''''''' - -sample code for calling Simple matrix multiply-accumulate with a vector demonstration, without LDS and no transpose for single-precision floating point types. - - -samples/simple_dgemv.cpp -'''''''''''''''''''''''' - -sample code for calling Simple matrix multiply-accumulate with a vector demonstration, without LDS and no transpose for double-precision floating point types. - - -samples/simple_sgemm.cpp -'''''''''''''''''''''''' - -Sample code for calling Simple GEMM algorithm demonstration without LDS memory usage and no transpose for single-precision floating point types. - - -samples/simple_dgemm.cpp -'''''''''''''''''''''''' - -Sample code for calling Simple GEMM algorithm demonstration without LDS memory usage and no transpose for double-precision floating point types. - - -samples/simple_hgemm.cpp -'''''''''''''''''''''''' - -Sample code for calling Simple GEMM algorithm demonstration without LDS memory usage and no transpose for half-precision floating point types. - - -samples/perf_sgemm.cpp -'''''''''''''''''''''''' - -Sample code for calling the best performant multi-block GEMM algorithm demonstration with LDS memory, Macro Tile Collaboration, Data Re-use and -Optimized pipeline for single-precision floating point types. - - -samples/perf_dgemm.cpp -'''''''''''''''''''''''' - -Sample code for calling the best performant multi-block GEMM algorithm demonstration with LDS memory, Macro Tile Collaboration, Data Re-use and -Optimized pipeline for double-precision floating point types. 
- - -samples/perf_hgemm.cpp -'''''''''''''''''''''''' - -Sample code for calling the best performant multi-block GEMM algorithm demonstration with LDS memory, Macro Tile Collaboration, Data Re-use and -Optimized pipeline for half-precision floating point types. - - -samples/simple_dlrm.cpp -''''''''''''''''''''''' - -Sample code for calling Simple Deep Learning Recommendation Model (DLRM) for machine learning. - - -samples/common.hpp -'''''''''''''''''' - -Common code used by all the above rocWMMA samples files. - - -The `test` directory -^^^^^^^^^^^^^^^^^^^^^^^ - -test/bin -'''''''' - -Script to generate benchmark plots from the gtest output dumps of benchmark tests of rocWMMA. - -test/dlrm -''''''''' - -Test code for various strategies of DLRM application. This test is used to validate dlrm functions using rocWMMA API. - -test/gemm -''''''''' - -Test Code for various strategies of GEMM application. This test is used to validate and benchmark GEMM functions using rocWMMA API. - -test/unit -''''''''' - -Test code for testing the basic functional units of rocWMMA library. - - -Infrastructure -^^^^^^^^^^^^^^ - -- CMake is used to build and package rocWMMA. There are CMakeLists.txt files throughout the code. -- Doxygen/Breathe/Sphinx/ReadTheDocs are used to produce documentation. Content for the documentation is from: - - - Doxygen comments in include files in the directory library/include - - files in the directory docs/source. - -- Jenkins is used to automate Continuous Integration testing. -- clang-format is used to format C++ code. diff --git a/docs/api-reference-guide.rst b/docs/api-reference-guide.rst new file mode 100644 index 00000000..fccd2ab5 --- /dev/null +++ b/docs/api-reference-guide.rst @@ -0,0 +1,229 @@ +.. meta:: + :description: C++ library for accelerating mixed precision matrix multiply-accumulate operations + leveraging specialized GPU matrix cores on AMD's latest discrete GPUs + :keywords: rocWMMA, ROCm, library, API, tool + +.. _api-reference-guide: + +******************** +API reference guide +******************** + +This document provides information about rocWMMA functions, data types, and other programming constructs. + +Synchronous API +^^^^^^^^^^^^^^^ + +In general, rocWMMA API functions ( ``load_matrix_sync``, ``store_matrix_sync``, ``mma_sync`` ) are assumed to be synchronous when +used in the context of global memory. + +When using these functions in the context of shared memory (e.g. LDS memory), additional explicit workgroup synchronization +may be required due to the nature of this memory usage. + +Supported data types +^^^^^^^^^^^^^^^^^^^^ + +rocWMMA mixed precision multiply-accumulate operations support the following data type combinations. + +Data Types **** = + +where, + +Input Type = Matrix A/B + +Output Type = Matrix C/D + +Compute Type = Math / accumulation type + +.. 
tabularcolumns:: + |C|C|C|C| + ++------------------------------+------------+-----------+---------------+ +|Ti / To / Tc |BlockM |BlockN |BlockK | ++==============================+============+===========+===============+ +|i8 / i32 / i32 |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|i8 / i32 / i32 |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|i8 / i8 / i32 |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|i8 / i32 / i32 |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|f16 / f32 / f32 |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|f16 / f32 / f32 |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|f16 / f16 / f32 |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|f16 / f16 / f32 |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|f16 / f16 / f16* |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|f16 / f16 / f16* |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|__half / f32 / f32 |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|__half / f32 / f32 |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|__half / __half / f32 |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|__half / __half / f32 |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|__half / __half / __half* |16 |16 |Min: 16, pow2 | ++------------------------------+------------+-----------+---------------+ +|__half / __half / __half* |32 |32 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|bf16 / f32 / f32 |16 |16 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|bf16 / f32 / f32 |32 |32 |Min: 4, pow2 | ++------------------------------+------------+-----------+---------------+ +|bf16 / bf16 / f32 |16 |16 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|bf16 / bf16 / f32 |32 |32 |Min: 4, pow2 | ++------------------------------+------------+-----------+---------------+ +|bf16 / bf16 / bf16* |16 |16 |Min: 8, pow2 | ++------------------------------+------------+-----------+---------------+ +|bf16 / bf16 / bf16* |32 |32 |Min: 4, pow2 | ++------------------------------+------------+-----------+---------------+ +|f32 / f32 / f32 |16 |16 |Min: 4, pow2 | ++------------------------------+------------+-----------+---------------+ +|f32 / f32 / f32 |32 |32 |Min: 2, pow2 | ++------------------------------+------------+-----------+---------------+ +|f64** / f64** / f64** |16 |16 |Min: 4, pow2 | ++------------------------------+------------+-----------+---------------+ + +*= Matrix unit accumulation is natively 32-bit precision and is converted to the desired type. + +**= f64 datatype is only supported on MI-200 class AMDGPU and successors. + +Supported matrix layouts +^^^^^^^^^^^^^^^^^^^^^^^^ + +(N = col major, T = row major) + +.. 
tabularcolumns:: + |C|C|C|C| + ++---------+--------+---------+--------+ +|LayoutA |LayoutB |Layout C |LayoutD | ++=========+========+=========+========+ +|N |N |N |N | ++---------+--------+---------+--------+ +|N |N |T |T | ++---------+--------+---------+--------+ +|N |T |N |N | ++---------+--------+---------+--------+ +|N |T |T |T | ++---------+--------+---------+--------+ +|T |N |N |N | ++---------+--------+---------+--------+ +|T |N |T |T | ++---------+--------+---------+--------+ +|T |T |N |N | ++---------+--------+---------+--------+ +|T |T |T |T | ++---------+--------+---------+--------+ + +----------------- +Using rocWMMA API +----------------- + +This section describes how to use the rocWMMA library API. + +rocWMMA datatypes +^^^^^^^^^^^^^^^^^ + +matrix_a +'''''''' + +.. doxygenstruct:: rocwmma::matrix_a + + +matrix_b +'''''''' + +.. doxygenstruct:: rocwmma::matrix_b + + +accumulator +''''''''''' + +.. doxygenstruct:: rocwmma::accumulator + + +row_major +''''''''' + +.. doxygenstruct:: rocwmma::row_major + + +col_major +''''''''' + +.. doxygenstruct:: rocwmma::col_major + + +VecT +'''' + +.. doxygenclass:: VecT + + + +IOConfig +'''''''''''' + +.. doxygenstruct:: rocwmma::IOConfig + + +IOShape +'''''''''''' + +.. doxygenstruct:: rocwmma::IOShape + +rocWMMA enumeration +^^^^^^^^^^^^^^^^^^^ + +.. note:: + The enumeration constants numbering is consistent with the standard C++ libraries. + +layout_t +'''''''''''' + +.. doxygenenum:: rocwmma::layout_t + + +rocWMMA API functions +^^^^^^^^^^^^^^^^^^^^^^ + +.. doxygenfunction:: fill_fragment + +.. doxygenfunction:: load_matrix_sync(fragment& frag, const DataT* data, uint32_t ldm) + +.. doxygenfunction:: load_matrix_sync(fragment& frag, const DataT* data, uint32_t ldm, layout_t layout) + +.. doxygenfunction:: store_matrix_sync(DataT* data, fragment const& frag, uint32_t ldm) + +.. doxygenfunction:: store_matrix_sync(DataT* data, fragment const& frag, uint32_t ldm,layout_t layout) + +.. doxygenfunction:: mma_sync + +.. doxygenfunction:: synchronize_workgroup + +.. doxygenfunction:: load_matrix_coop_sync(fragment& frag, const DataT* data, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount, uint32_t splitCount) + +.. doxygenfunction:: load_matrix_coop_sync(fragment& frag, const DataT* data, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount) + +.. doxygenfunction:: load_matrix_coop_sync(fragment& frag, const DataT* data, uint32_t ldm) + +.. doxygenfunction:: store_matrix_coop_sync(DataT* data, fragment const& frag, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount, uint32_t splitCount) + +.. doxygenfunction:: store_matrix_coop_sync(DataT* data, fragment const& frag, uint32_t ldm, uint32_t waveIndex, uint32_t waveCount) + +.. doxygenfunction:: store_matrix_coop_sync(DataT* data, fragment const& frag, uint32_t ldm) + +Sample programs +^^^^^^^^^^^^^^^^ + +See a sample code for calling rocWMMA functions ``load_matrix_sync``, ``store_matrix_sync``, ``fill_fragment``, and ``mma_sync`` `here `_. +For more such sample programs, refer to the `Samples directory `_. diff --git a/docs/index.rst b/docs/index.rst index c2023c5a..57eb9fbd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,13 +1,35 @@ -================== -rocWMMA User Guide -================== - -.. toctree:: - :maxdepth: 5 - :caption: Contents: - :numbered: - - Linux_Install_Guide - API_Reference_Guide - Programmers_Guide - Contributors_Guide +.. 
meta:: + :description: C++ library for accelerating mixed precision matrix multiply-accumulate operations + leveraging specialized GPU matrix cores on AMD's latest discrete GPUs + :keywords: rocWMMA, ROCm, library, API, tool + +.. _index: + +=========================== +rocWMMA documentation +=========================== + +Welcome to the rocWMMA docs home page! To learn more, see :ref:`what-is-rocwmma`. + +Our documentation is structured as follows: + +.. grid:: 2 + :gutter: 3 + + .. grid-item-card:: Tutorial + + * :ref:`installation` + + .. grid-item-card:: Conceptual + + * :ref:`programmers-guide` + + .. grid-item-card:: API reference + + * :ref:`api-reference-guide` + +To contribute to the documentation refer to +`Contributing to ROCm `_. + +Licensing information can be found on the +`Licensing `_ page. \ No newline at end of file diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 00000000..8ac3c6ad --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,294 @@ +.. meta:: + :description: C++ library for accelerating mixed precision matrix multiply-accumulate operations + leveraging specialized GPU matrix cores on AMD's latest discrete GPUs + :keywords: rocWMMA, ROCm, library, API, tool + +.. _installation: + +============== +Installation +============== + +This document provides instructions for installing and configuring the rocWMMA library. +The quickest way to install is using prebuilt packages. Alternatively, there are instructions to build from source. + +------------- +Prerequisites +------------- + +A ROCm enabled platform. More information `here `_. + +----------------------------- +Installing pre-built packages +----------------------------- + +To install rocWMMA on Ubuntu or Debian, use: + +:: + + sudo apt-get update + sudo apt-get install rocWMMA + +To install rocWMMA on CentOS, use: + +:: + + sudo yum update + sudo yum install rocWMMA + +To install rocWMMA on SLES, use: + +:: + + sudo dnf upgrade + sudo dnf install rocWMMA + +Once installed, rocWMMA can be used just like any other library with a C++ API. + +Once rocWMMA is installed, you can see the ``rocwmma.hpp`` header file in the ``/opt/rocm/include`` directory. +You must include only ``rocwmma.hpp`` in the user code to make calls into rocWMMA. Don't directly include other rocWMMA files that are found in ``/opt/rocm/include/internal``. + +------------------------------- +Building and installing rocWMMA +------------------------------- + +For most users building from source is not necessary, as rocWMMA can be used after installing the pre-built +packages as described above. If still desired, here are the instructions to build rocWMMA from source: + +System requirements +^^^^^^^^^^^^^^^^^^^ +As a general rule, 8GB of system memory is required for a full rocWMMA build. This value can be lower if rocWMMA is built without tests. This value may also increase in the future as more functions are added to rocWMMA. + + +GPU support +^^^^^^^^^^^ +AMD CDNA class GPU featuring matrix core support: `gfx908`, `gfx90a` as `gfx9` + +.. note:: + Double precision FP64 datatype support requires gfx90a + +Or + +AMD RDNA3 class GPU featuring AI acceleration support: `gfx1100`, `gfx1101`, `gfx1102` as `gfx11` + +Download rocWMMA +^^^^^^^^^^^^^^^^^ + +The rocWMMA source code is available at the `rocWMMA github page `_. rocWMMA has a minimum ROCm support version 5.4. 
+To check the ROCm version on an Ubuntu system, use: + +:: + + apt show rocm-libs -a + +On Centos, use: + +:: + + yum info rocm-libs + +The ROCm version has major, minor, and patch fields, possibly followed by a build specific identifier. For example, a ROCm version 4.0.0.40000-23 corresponds to major = 4, minor = 0, patch = 0, and build identifier 40000-23. +There are GitHub branches at the rocWMMA site with names ``rocm-major.minor.x`` where major and minor are the same as in the ROCm version. To download rocWMMA on ROCm version 4.0.0.40000-23, use: + +:: + + git clone -b release/rocm-rel-x.y https://github.com/ROCmSoftwarePlatform/rocWMMA.git + cd rocWMMA + +Replace ``x.y`` in the above command with the version of ROCm installed on your machine. For example, if you have ROCm 5.0 installed, then replace release/rocm-rel-x.y with release/rocm-rel-5.0. + +You can choose to build any of the following: + +* library + +* library and samples + +* library and tests + +* library, tests, and assembly + +You only need (library) for calling rocWMMA from your code. +The client contains the test samples and benchmark code. + +Below are the project options available to build rocWMMA library with or without clients. + +.. list-table:: + + * - **Option** + - **Description** + - **Default Value** + * - AMDGPU_TARGETS + - Build code for specific GPU target(s) + - ``gfx908:xnack-``; ``gfx90a:xnack-``; ``gfx90a:xnack+``; ``gfx940``; ``gfx941``; ``gfx942``; ``gfx1100``; ``gfx1101``; ``gfx1102`` + * - ROCWMMA_BUILD_TESTS + - Build Tests + - ON + * - ROCWMMA_BUILD_SAMPLES + - Build Samples + - ON + * - ROCWMMA_BUILD_ASSEMBLY + - Generate assembly files + - OFF + * - ROCWMMA_BUILD_VALIDATION_TESTS + - Build validation tests + - ON (requires ROCWMMA_BUILD_TESTS=ON) + * - ROCWMMA_BUILD_BENCHMARK_TESTS + - Build benchmark tests + - OFF (requires ROCWMMA_BUILD_TESTS=ON) + * - ROCWMMA_BUILD_EXTENDED_TESTS + - Build extended testing coverage + - OFF (requires ROCWMMA_BUILD_TESTS=ON) + * - ROCWMMA_VALIDATE_WITH_ROCBLAS + - Use rocBLAS for validation tests + - ON (requires ROCWMMA_BUILD_VALIDATION_TESTS=ON) + * - ROCWMMA_BENCHMARK_WITH_ROCBLAS + - Include rocBLAS benchmarking data + - OFF (requires ROCWMMA_BUILD_BENCHMARK_TESTS=ON) + +Build library +^^^^^^^^^^^^^^^^^^ + +ROCm-cmake has a minimum version requirement of 0.8.0 for ROCm 5.3. + +Minimum ROCm version support is 5.4. + +By default, the project is configured in Release mode. + +To build the library alone, run: + +.. code-block:: bash + + CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_BUILD_TESTS=OFF -DROCWMMA_BUILD_SAMPLES=OFF + +Here are some other example project configurations: + +.. tabularcolumns:: + |\X{1}{4}|\X{3}{4}| + ++-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Configuration | Command | ++===================================+====================================================================================================================+ +| Basic | ``CC=hipcc CXX=hipcc cmake -B`` | ++-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Targeting gfx908 | ``CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-`` | ++-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Debug build | ``CC=hipcc CXX=hipcc cmake -B . 
-DCMAKE_BUILD_TYPE=Debug`` | ++-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Build without rocBLAS(default on) | ``CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_VALIDATE_WITH_ROCBLAS=OFF -DROCWMMA_BENCHMARK_WITH_ROCBLAS=OFF`` | ++-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ + +After configuration, build using: + +.. code-block:: bash + + cmake --build -- -j + +Build library and samples +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To build library and samples, run: + +.. code-block:: bash + + CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_BUILD_TESTS=OFF -DROCWMMA_BUILD_SAMPLES=ON + +After configuration, build using: + +.. code-block:: bash + + cmake --build -- -j + +The samples folder in ```` contains executables as given in the table below. + +================ ============================================================================================================================== +Executable Name Description +================ ============================================================================================================================== +``simple_sgemm`` A simple General Matrix Multiply (GEMM) operation [D = alpha * (A x B) + beta * C] using rocWMMA API for single-precision floating point types +``simple_dgemm`` A simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for double-precision floating point types +``simple_hgemm`` A simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for half-precision floating point types + +``perf_sgemm`` An optimized GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for single-precision floating point types +``perf_dgemm`` An optimized GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for double-precision floating point types +``perf_hgemm`` An optimized GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API for half-precision floating point types + +``simple_sgemv`` A simple GEMV operation [y = alpha * (A) * x + beta * y] using rocWMMA API for single-precision fp32 inputs and output +``simple_dgemv`` A simple GEMV operation [y = alpha * (A) * x + beta * y] using rocWMMA API for double-precision fp64 inputs and output + +``simple-dlrm`` A simple DLRM operation using rocWMMA API + +``hipRTC_gemm`` A simple GEMM operation [D = alpha * (A x B) + beta * C] demonstrating runtime compilation (hipRTC) compatibility +================ ============================================================================================================================== + + +Build library and tests +^^^^^^^^^^^^^^^^^^^^^^^^^ +rocWMMA provides the following test suites: + +- DLRM tests: Cover the dot product interactions between embeddings used in DLRM +- GEMM tests: Cover block-wise Generalized Matrix Multiplication (GEMM) implemented with rocWMMA +- Unit tests: Cover various aspects of rocWMMA API and internal functionality + +rocWMMA can build both validation and benchmark tests. The library uses CPU or rocBLAS methods for validation (when available) and benchmark comparisons based on the provided project option. +By default, the project is linked against rocBLAS for validating results. +Minimum ROCBLAS library version requirement for ROCm 4.3.0 is 2.39.0. + +To build library and tests, run: + +.. code-block:: bash + + CC=hipcc CXX=hipcc cmake -B . 
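The configuration above generates the build system in the current source directory. If you prefer an out-of-source build, the same two steps can name a separate build directory explicitly; the directory name ``build`` below is only an example, and the ``-S``/``-B`` form requires CMake 3.13 or newer:

.. code-block:: bash

   # Configure the library and tests into ./build (directory name is an example)
   CC=hipcc CXX=hipcc cmake -S . -B build

   # Build from the named directory, running jobs in parallel
   cmake --build build -- -j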
+ +After configuration, build using: + +.. code-block:: bash + + cmake --build -- -j + +The tests in ```` contain executables as given in the table below. + +====================================== =========================================================================================================== +Executable Name Description +====================================== =========================================================================================================== +``dlrm/dlrm_dot_test-``* A DLRM implementation using rocWMMA API +``dlrm/dlrm_dot_lds_test-``* A DLRM implementation using rocWMMA API with LDS shared memory +``gemm/mma_sync_test-``* A simple GEMM operation [D = alpha * (A x B) + beta * C] using rocWMMA API +``gemm/mma_sync_multi_test-``* A modified GEMM operation where each wave targets a sub-grid of output blocks using rocWMMA API +``gemm/mma_sync_multi_ad_hoc_test-``* An adhoc version of ``mma_sync_multi_test-``* +``gemm/mma_sync_multi_lds_test-``* A modified GEMM operation where each wave targets a sub-grid of output blocks using LDS memory, rocWMMA API, and wave-level collaboration +``gemm/mma_sync_multi_lds_ad_hoc_test-``* An adhoc version of ``mma_sync_multi_lds_test-``* +``gemm/mma_sync_coop_wg_test-``* A modified GEMM operation where each wave targets a sub-grid of output blocks using LDS memory, rocWMMA API, and workgroup-level collaboration +``gemm/mma_sync_coop_wg_ad_hoc_test-``* An adhoc version of ``mma_sync_coop_wg_test-``* +``gemm/barrier_test-``* A simple GEMM operation with wave synchronization +``unit/contamination_test`` Tests against contamination of pristine data for loads and stores +``unit/cross_lane_ops_test`` Tests cross-lane vector operations +``unit/fill_fragment_test`` Tests fill_fragment API function +``unit/io_shape_test`` Tests input and output shape meta data +``unit/io_traits_test`` Tests input and output logistical meta data +``unit/layout_test`` Tests accuracy of internal matrix layout patterns +``unit/load_store_matrix_sync_test`` Tests ``load_matrix_sync`` and ``store_matrix_sync`` API functions +``unit/load_store_matrix_coop_sync_test`` Tests ``load_matrix_coop_sync`` and ``store_matrix_coop_sync`` API functions +``unit/map_util_test`` Tests mapping utilities used in rocWMMA implementations +``unit/vector_iterator_test`` Tests internal vector storage iteration implementation +``unit/vector_test`` Tests internal vector storage implementation +====================================== =========================================================================================================== + +*= Validate: Executables that compare outputs for correctness against reference sources such as CPU or rocBLAS calculations. + +*= Bench: Executables that measure kernel execution speeds and may compare against those of rocBLAS references. + +Build library, tests, and assembly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To build the library and tests with assembly code generation, run: + +.. code-block:: bash + + CC=hipcc CXX=hipcc cmake -B . -DROCWMMA_BUILD_ASSEMBLY=ON + +After configuration, build using: + +.. code-block:: bash + + cmake --build -- -j + +The assembly folder in ```` contains assembly generation of test executables in the format ``test_executable_name.s`` diff --git a/docs/programmers-guide.rst b/docs/programmers-guide.rst new file mode 100644 index 00000000..3667543a --- /dev/null +++ b/docs/programmers-guide.rst @@ -0,0 +1,100 @@ +.. 
+
+Build library, tests, and assembly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To build the library and tests with assembly code generation, run:
+
+.. code-block:: bash
+
+    CC=hipcc CXX=hipcc cmake -B <build_dir> . -DROCWMMA_BUILD_ASSEMBLY=ON
+
+After configuration, build using:
+
+.. code-block:: bash
+
+    cmake --build <build_dir> -- -j
+
+The assembly folder in ``<build_dir>`` contains the generated assembly code for the test executables, named in the format ``test_executable_name.s``.
diff --git a/docs/programmers-guide.rst b/docs/programmers-guide.rst
new file mode 100644
index 00000000..3667543a
--- /dev/null
+++ b/docs/programmers-guide.rst
@@ -0,0 +1,100 @@
+.. meta::
+   :description: C++ library for accelerating mixed precision matrix multiply-accumulate operations
+                 leveraging specialized GPU matrix cores on AMD's latest discrete GPUs
+   :keywords: rocWMMA, ROCm, library, API, tool
+
+.. _programmers-guide:
+
+===================
+Programmer's guide
+===================
+
+This document provides insight into the library source code organization, design and implementation details, helpful information for new development, and testing and benchmarking details.
+
+--------------------------------
+Library source code organization
+--------------------------------
+
+The rocWMMA code is split into four major parts:
+
+- The ``library`` directory contains the library source code.
+- The ``samples`` directory contains real-world use-cases of the rocWMMA API.
+- The ``test`` directory contains validation tests for the rocWMMA API.
+- Infrastructure
+
+``library`` directory
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``library`` directory contains the following include files:
+
+- ``library/include/rocwmma/``: C++ include files for the rocWMMA API. These files also contain Doxygen comments that document the API.
+
+- ``library/include/internal``: Internal include files for:
+
+  - Type support
+  - Input and output configuration, shapes and traits
+  - Layout
+  - Mapping utility
+  - Cross-lane operation utility
+  - Vector blend utility
+  - Packing and unpacking
+  - Conversion and broadcasting
+  - Load and store
+  - Matrix multiply-accumulate
+  - Cooperative load and store
+  - Threadblock synchronization
+  - Utility code
+
+``samples`` directory
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``samples`` directory contains the sample code for the following use cases:
+
+- ``samples/hipRTC_gemm.cpp``: Demonstrates a simple General Matrix Multiply (GEMM) algorithm without LDS memory usage or transpose, called from within the hipRTC environment
+
+- ``samples/simple_sgemv.cpp``: Demonstrates a simple matrix multiply-accumulate with a vector, without LDS memory usage or transpose, for single-precision floating point types
+
+- ``samples/simple_dgemv.cpp``: Demonstrates a simple matrix multiply-accumulate with a vector, without LDS memory usage or transpose, for double-precision floating point types
+
+- ``samples/simple_sgemm.cpp``: Demonstrates a simple GEMM algorithm without LDS memory usage or transpose for single-precision floating point types
+
+- ``samples/simple_dgemm.cpp``: Demonstrates a simple GEMM algorithm without LDS memory usage or transpose for double-precision floating point types
+
+- ``samples/simple_hgemm.cpp``: Demonstrates a simple GEMM algorithm without LDS memory usage or transpose for half-precision floating point types
+
+- ``samples/perf_sgemm.cpp``: Demonstrates the best-performing multi-block GEMM algorithm with LDS memory, macro tile collaboration, data reuse, and an optimized pipeline for single-precision floating point types
+
+- ``samples/perf_dgemm.cpp``: Demonstrates the best-performing multi-block GEMM algorithm with LDS memory, macro tile collaboration, data reuse, and an optimized pipeline for double-precision floating point types
+
+- ``samples/perf_hgemm.cpp``: Demonstrates the best-performing multi-block GEMM algorithm with LDS memory, macro tile collaboration, data reuse, and an optimized pipeline for half-precision floating point types
+
+- ``samples/simple_dlrm.cpp``: Demonstrates a simple Deep Learning Recommendation Model (DLRM) operation for machine learning
+
+- ``samples/common.hpp``: Common code used by all the rocWMMA sample files listed above
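+
+Because the ``library`` directory described above consists of include files, code modeled on these samples can be compiled directly with ``hipcc`` against the rocWMMA headers. The include path, GPU target, and file names in the sketch below are assumptions for a default ROCm installation; adjust them for your system:
+
+.. code-block:: bash
+
+    # Sketch: compile a standalone HIP program that uses the rocWMMA headers.
+    # /opt/rocm, gfx90a, and my_rocwmma_sample.cpp are placeholders; adjust as needed.
+    hipcc -std=c++17 -I/opt/rocm/include --offload-arch=gfx90a my_rocwmma_sample.cpp -o my_rocwmma_sample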
+
+``test`` directory
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``test`` directory contains the test code for the following functionalities:
+
+- ``test/bin``: Scripts for generating benchmark plots from the ``gtest`` output dumps of rocWMMA's benchmark tests.
+
+- ``test/dlrm``: Tests for various strategies of the DLRM application. These tests validate DLRM functions implemented with the rocWMMA API.
+
+- ``test/gemm``: Tests for various strategies of the GEMM application. These tests validate and benchmark GEMM functions implemented with the rocWMMA API.
+
+- ``test/unit``: Tests for the basic functional units of the rocWMMA library.
+
+Infrastructure
+^^^^^^^^^^^^^^
+
+- CMake is used to build and package rocWMMA. There are ``CMakeLists.txt`` files throughout the code.
+
+- Doxygen, Breathe, Sphinx, and Read the Docs are used to produce documentation. The API documentation is generated using:
+
+  - Doxygen comments in the include files in the directory ``library/include``
+  - Files in the directory ``docs/source``
+
+- Jenkins is used to automate Continuous Integration (CI) testing.
+
+- ``clang-format`` is used to format C++ code.
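+
+For example, a contributor could apply the formatting in place before submitting a change. The command below is a sketch that assumes a ``.clang-format`` configuration at the repository root and the directory layout described above:
+
+.. code-block:: bash
+
+    # Sketch: format the C++ sources in place using the repository's clang-format settings.
+    # Assumes a .clang-format file at the repository root.
+    find library samples test -name '*.hpp' -o -name '*.cpp' | xargs clang-format -i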
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 0c4fb332..6b57c757 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -2,6 +2,18 @@
 # These comments will also be removed.
 root: index
 subtrees:
+  - entries:
+    - file: what-is-rocwmma
+  - caption: Tutorial
+    entries:
+    - file: installation
+  - caption: Conceptual
+    entries:
+    - file: programmers-guide
+  - caption: API reference
+    entries:
+    - file: api-reference-guide
   - caption: About
     entries:
     - file: license
+
diff --git a/docs/what-is-rocwmma.rst b/docs/what-is-rocwmma.rst
new file mode 100644
index 00000000..bc19095e
--- /dev/null
+++ b/docs/what-is-rocwmma.rst
@@ -0,0 +1,25 @@
+.. meta::
+   :description: C++ library for accelerating mixed precision matrix multiply-accumulate operations
+                 leveraging specialized GPU matrix cores on AMD's latest discrete GPUs
+   :keywords: rocWMMA, ROCm, library, API, tool
+
+.. _what-is-rocwmma:
+
+*****************
+What is rocWMMA?
+*****************
+
+rocWMMA, where WMMA stands for Wavefront Mixed precision Multiply Accumulate, is AMD's C++ library for accelerating mixed precision matrix multiply-accumulate operations
+leveraging specialized GPU matrix cores on AMD's latest discrete GPUs.
+
+The C++ APIs facilitate the decomposition of matrix multiply-accumulate problems into
+discretized block fragments and parallelize the block-wise operations across multiple GPU wavefronts.
+
+The API is implemented in the GPU device code, which empowers user device kernel code with direct use of GPU matrix cores.
+Moreover, this code can benefit from inline compiler optimization passes and avoids the additional
+overhead of external runtime calls or extra kernel launches.
+
+As rocWMMA is written in C++, it can be applied directly in the device kernel code. The library code is templated for modularity and uses the available meta-data to provide opportunities for compile-time inferences and optimizations.
+
+The rocWMMA API exposes block-wise data load and store and matrix multiply-accumulate functions appropriately sized for thread-block execution on data fragments. Matrix multiply-accumulate functionality supports mixed precision inputs and outputs with native fixed-precision accumulation. The rocWMMA Coop API provides wave and warp collaborations within the thread blocks for block-wise data loads and stores.
+Supporting code is required for GPU device management and kernel invocation. The provided kernel code samples and tests are built and launched via the Heterogeneous-Compute Interface for Portability (HIP) ecosystem within ROCm.
\ No newline at end of file