GPU vectorize pass (#970)
Adds a GPU kernel vectorization pass.
Extends the CUDA lowering pass to support vector operations.

A GPU-specific vectorization pass guides the upstream Linalg vectorizer to
process operations that are located within a GPU kernel or are prepared for
outlining. The CUDA-specific lowering pass is extended to allow lowering of
vector ops within a GPU kernel.

For now, vectorization is disabled within the GPU pipeline due to the lack of
vector operation unrolling. When vector sizes exceed hardware-supported
lengths, the pipeline gets stuck at the GPU binary compilation step. This will
be addressed by a separate transformation pass in the future.
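
As an illustrative sketch (not part of this patch), the kind of IR the new pass targets is a statically shaped Linalg op tiled inside a parallel loop, for example:

func.func @tiled_matmul(%A: tensor<8x8xf32>, %B: tensor<8x8xf32>,
                        %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
  %0 = scf.forall (%i, %j) = (0, 0) to (8, 8) step (4, 4)
      shared_outs(%out = %C) -> (tensor<8x8xf32>) {
    // Statically sized tiles of the operands.
    %a = tensor.extract_slice %A[%i, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
    %b = tensor.extract_slice %B[0, %j] [8, 4] [1, 1] : tensor<8x8xf32> to tensor<8x4xf32>
    %c = tensor.extract_slice %out[%i, %j] [4, 4] [1, 1] : tensor<8x8xf32> to tensor<4x4xf32>
    // The pass hands this op to the upstream Linalg vectorizer.
    %res = linalg.matmul ins(%a, %b : tensor<4x8xf32>, tensor<8x4xf32>)
                         outs(%c : tensor<4x4xf32>) -> tensor<4x4xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %res into %out[%i, %j] [4, 4] [1, 1]
        : tensor<4x4xf32> into tensor<8x8xf32>
    }
  }
  return %0 : tensor<8x8xf32>
}

After vectorization, the matmul becomes vector.transfer_read / vector.contract / vector.transfer_write ops on the tiles, similar to the IR in the new integration test included in this change.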
adam-smnk authored Oct 3, 2024
1 parent 7de59de commit 894bbc4
Showing 7 changed files with 392 additions and 7 deletions.
14 changes: 14 additions & 0 deletions include/TPP/Passes.td
@@ -525,4 +525,18 @@ def SplitReductionDim : Pass<"split-reduction-dim", "func::FuncOp"> {
];
}

def GpuVectorize : Pass<"gpu-vectorize", "ModuleOp"> {
let summary = "Vectorize GPU kernel.";
let description = [{
Convert ops targeting GPU to vectorized representation.
}];
let dependentDialects = ["gpu::GPUDialect",
"scf::SCFDialect",
"memref::MemRefDialect",
"tensor::TensorDialect",
"math::MathDialect",
"arith::ArithDialect",
"vector::VectorDialect"];
}

#endif // TPP_DIALECT_TPP_PASSES
1 change: 1 addition & 0 deletions lib/TPP/GPU/CMakeLists.txt
@@ -8,6 +8,7 @@ add_mlir_library(TPPGPU
GpuDataTransfer.cpp
GpuInlineConstants.cpp
LinalgToXeGPU.cpp
GpuVectorize.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/TPP
18 changes: 14 additions & 4 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -65,9 +65,9 @@ llvm::cl::list<int64_t>
llvm::cl::CommaSeparated);

// Control GPU vectorization.
llvm::cl::opt<bool> gpuVectorize("gpu-vectorize",
llvm::cl::desc("Vectorize GPU kernel"),
llvm::cl::init(false));
llvm::cl::opt<bool> gpuVector("gpu-vector",
llvm::cl::desc("Vectorize GPU kernel"),
llvm::cl::init(false));

namespace mlir {
namespace tpp {
@@ -187,12 +187,22 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
pm.addPass(createTileConsumerAndFuseProducers(threadTileOptions));
pm.addPass(createCleanup());

if (gpuVectorize) {
if (gpuVector) {
// Early reduction dimension splitting is incompatible with
// Linalg to XeGPU lowering that expects full GEMM.
// For now, enable only with other vectorization passes.
pm.addPass(createSplitReductionDim(SplitReductionDimOptions{kTile}));
pm.addPass(createCleanup());

// Vectorize at the tensor level to benefit from better cleanup utilities
// such as folding.
// TODO: Enable vectorization in the default pipeline once vector unrolling
// is added. When vector sizes exceed hardware-supported lengths, the
// pipeline gets stuck at the GPU binary compilation step, so vectorization
// can only be enabled once a pass that resizes vector operations is
// available.
pm.addPass(createGpuVectorize());
pm.addPass(createCleanup());
}

// Preprocess and bufferize as further conversion requires memref
17 changes: 14 additions & 3 deletions lib/TPP/GPU/GpuToCuda.cpp
@@ -67,18 +67,29 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
memref::createExpandStridedMetadataPass());
pm.addNestedPass<gpu::GPUModuleOp>(arith::createArithExpandOpsPass());
pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());

// Create CUDA kernels.
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVGPUToNVVMPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToLLVMPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertFuncToLLVMPass());
pm.addNestedPass<gpu::GPUModuleOp>(createArithToLLVMConversionPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertIndexToLLVMPass());

GpuNVVMAttachTargetOptions nvvmTargetOptions;
nvvmTargetOptions.triple = gpuTriple;
nvvmTargetOptions.chip = gpuChip;
nvvmTargetOptions.features = gpuFeatures;
pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));

// Create CUDA kernels.
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());

// Cleanup IR.
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
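
For context, here is a minimal sketch (assumed, not taken from the patch) of a GPU kernel containing vector ops like those the CUDA lowering previously did not handle; the newly added ConvertVectorToSCF and ConvertVectorToLLVM steps progressively lower such ops inside the GPU module:

gpu.module @kernels {
  gpu.func @vec_add(%a: memref<8xf32>, %b: memref<8xf32>) kernel {
    %c0 = arith.constant 0 : index
    %pad = arith.constant 0.0 : f32
    // 1-D vector loads, add, and store inside the kernel body.
    %va = vector.transfer_read %a[%c0], %pad {in_bounds = [true]} : memref<8xf32>, vector<8xf32>
    %vb = vector.transfer_read %b[%c0], %pad {in_bounds = [true]} : memref<8xf32>, vector<8xf32>
    %sum = arith.addf %va, %vb : vector<8xf32>
    vector.transfer_write %sum, %a[%c0] {in_bounds = [true]} : vector<8xf32>, memref<8xf32>
    gpu.return
  }
}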
116 changes: 116 additions & 0 deletions lib/TPP/GPU/GpuVectorize.cpp
@@ -0,0 +1,116 @@
//===- GpuVectorize.cpp ------------------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "TPP/Passes.h"

#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/TransformOps/Utils.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

namespace mlir {
namespace tpp {
#define GEN_PASS_DEF_GPUVECTORIZE
#include "TPP/Passes.h.inc"
} // namespace tpp
} // namespace mlir

namespace {

// Vectorize ops within GPU kernel.
struct VectorizeGpuLaunch : public OpRewritePattern<gpu::LaunchOp> {
using OpRewritePattern<gpu::LaunchOp>::OpRewritePattern;

LogicalResult matchAndRewrite(gpu::LaunchOp launchOp,
PatternRewriter &rewriter) const override {
// Vectorize all linalg ops within GPU kernel.
// It is expected that the ops operate on statically sized tiles.
auto walkResult = launchOp->walk([&](linalg::LinalgOp linalgOp) {
if (linalgOp.hasDynamicShape())
return WalkResult::interrupt();

if (failed(vectorize(rewriter, linalgOp, /*inputVectorSizes=*/{},
/*scalableVecDims=*/{})))
return WalkResult::interrupt();
return WalkResult::advance();
});

if (walkResult.wasInterrupted())
return rewriter.notifyMatchFailure(
launchOp, "Failed to vectorize ops within GPU launch");

return success();
}
};

// Vectorize linalg ops targeting GPU.
struct GpuVectorizeLinalg : public OpInterfaceRewritePattern<linalg::LinalgOp> {
using OpInterfaceRewritePattern<linalg::LinalgOp>::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp,
PatternRewriter &rewriter) const override {
// Vectorize all Linalg ops within parallelized loops.
if (!linalgOp.hasPureTensorSemantics())
return rewriter.notifyMatchFailure(linalgOp, "Expects tensor semantics");

if (linalgOp.hasDynamicShape())
return rewriter.notifyMatchFailure(linalgOp,
"Expects static shapes only");

// Only process operations within parallelized loops.
// TODO: Use some different mechanism like annotations to determine which
// ops target GPU.
if (!linalgOp->getParentOfType<scf::ForallOp>())
return rewriter.notifyMatchFailure(linalgOp,
"Expects parallel loop parent");

return vectorize(rewriter, linalgOp, /*inputVectorSizes=*/{},
/*scalableVecDims=*/{});
}
};

// Vectorize operations targeting GPU.
struct GpuVectorize : public tpp::impl::GpuVectorizeBase<GpuVectorize> {
using GpuVectorizeBase::GpuVectorizeBase;

void runOnOperation() override {
MLIRContext *ctx = getOperation().getContext();
RewritePatternSet patterns(ctx);

// Vectorize core computation ops within kernel launch.
patterns.add<VectorizeGpuLaunch, GpuVectorizeLinalg>(ctx);

// Vector postprocessing patterns.
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
vector::populateVectorReductionToContractPatterns(patterns);
vector::populateSinkVectorOpsPatterns(patterns);
vector::TransferReadOp::getCanonicalizationPatterns(patterns, ctx);
vector::TransferWriteOp::getCanonicalizationPatterns(patterns, ctx);

(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
};

} // namespace
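
As a rough illustration of the VectorizeGpuLaunch pattern (the input below is assumed, not part of the patch), a statically shaped Linalg op that already sits inside a gpu.launch region is vectorized in place:

func.func @launch_matmul(%A: memref<4x8xf32>, %B: memref<8x4xf32>, %C: memref<4x4xf32>) {
  %c1 = arith.constant 1 : index
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
    // The walk vectorizes this op; any dynamically shaped Linalg op inside
    // the launch interrupts the walk and fails the rewrite.
    linalg.matmul ins(%A, %B : memref<4x8xf32>, memref<8x4xf32>)
                  outs(%C : memref<4x4xf32>)
    gpu.terminator
  }
  return
}

The GpuVectorizeLinalg pattern covers the pre-outlining case instead: tensor-semantics Linalg ops nested under scf.forall, as in the integration test below.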
28 changes: 28 additions & 0 deletions test/GPU/CUDA/Integration/vector-contract-small.mlir
@@ -0,0 +1,28 @@
// RUN: ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0:${ASAN_OPTIONS} \
// RUN: tpp-run %s -gpu=cuda -print \
// RUN: -entry-point-result=void -e entry 2>&1 | \
// RUN: FileCheck %s

#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @entry(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<8x8xf32>) -> tensor<8x8xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = scf.forall (%arg3, %arg4) = (0, 0) to (8, 8) step (4, 4) shared_outs(%arg5 = %arg2) -> (tensor<8x8xf32>) {
%extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg4] [8, 4] [1, 1] : tensor<8x8xf32> to tensor<8x4xf32>
%extracted_slice_1 = tensor.extract_slice %arg5[%arg3, %arg4] [4, 4] [1, 1] : tensor<8x8xf32> to tensor<4x4xf32>
%1 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x8xf32>, vector<4x8xf32>
%2 = vector.transfer_read %extracted_slice_0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<8x4xf32>, vector<8x4xf32>
%3 = vector.transfer_read %extracted_slice_1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
%4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<4x8xf32>, vector<8x4xf32> into vector<4x4xf32>
%5 = vector.transfer_write %4, %extracted_slice_1[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %5 into %arg5[%arg3, %arg4] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32>
}
}
return %0 : tensor<8x8xf32>
}

// CHECK-COUNT-8: 9, 9, 9, 9, 9, 9, 9, 9