Skip to content

Commit

Permalink
[AMD] Remove stream pipeliner v1 (#4845)
Browse files Browse the repository at this point in the history
We have had stream pipeliner v2 turned on by default
for quite some time. All known issues have been fixed.
So now remove the old v1 pipeliner.

Note that this changes how `num_stages` is handled:
previously we enabled pipelining if `num_stages`
was `0`, which really is not good behavior. We have now switched
to follow common practice, where `0`/`1` won't trigger
pipelining anymore; `2` or more is needed to trigger it.

Given downstream users might be using `0` in their
codebase, right now we `assert` to give developers
a clear indication of the switch in behavior instead of
silently dropping the perf. The `assert` is expected to be
dropped sometime down the line.

---------

Co-authored-by: Lei Zhang <antiagainst@gmail.com>
  • Loading branch information
sjw36 and antiagainst authored Oct 18, 2024
1 parent d4e5a78 commit 76ed94d
Show file tree
Hide file tree
Showing 8 changed files with 8 additions and 982 deletions.
1 change: 0 additions & 1 deletion bin/RegisterTritonDialects.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::registerTritonAMDGPUAccelerateMatmul();
mlir::registerTritonAMDGPUOptimizeEpilogue();
mlir::registerTritonAMDGPUReorderInstructions();
mlir::registerTritonAMDGPUStreamPipeline();
mlir::registerTritonAMDGPUStreamPipelineV2();
mlir::registerTritonAMDGPUCanonicalizePointers();

Expand Down
31 changes: 0 additions & 31 deletions test/TritonGPU/amd/amd-loop-pipeline-v1.mlir

This file was deleted.

20 changes: 8 additions & 12 deletions third_party/amd/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def min_dot_size(target: GPUTarget):
class HIPOptions:
num_warps: int = 4
waves_per_eu: int = 1
num_stages: int = 0
num_stages: int = 2
num_ctas: int = 1
extern_libs: dict = None
cluster_dims: tuple = (1, 1, 1)
Expand Down Expand Up @@ -215,23 +215,19 @@ def make_ttgir(mod, metadata, options):
passes.ttgpuir.add_remove_layout_conversions(pm)
amd.passes.ttgpuir.add_optimize_epilogue(pm)
passes.ttgpuir.add_optimize_dot_operands(pm, True)
use_new_pipeliner = os.getenv("TRITON_HIP_USE_NEW_STREAM_PIPELINE", "1") == "1"
if amd.has_matrix_core_feature(options.arch):
if use_new_pipeliner:
# In the old pipeliner we only support num_stages = 0/1, which means something
# different than the NVIDIA side. In the new pipeliner we unify the num_stages
# interpretation. Default to use 2 stages if not explicitly set.
num_stages = options.num_stages if options.num_stages != 0 else 2
amd.passes.ttgpuir.add_stream_pipelinev2(pm, num_stages)
else:
if options.num_stages == 0:
amd.passes.ttgpuir.add_stream_pipeline(pm)
assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. "
"We used to trigger software pipelining with "
"num_stages == 0. Now it will not happen anymore; "
"please update to use num_stages == 2 for "
"equivalent behavior in the past.")
amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages)
passes.common.add_canonicalizer(pm)
amd.passes.ttgpuir.insert_instruction_sched_hints(pm)
passes.ttgpuir.add_optimize_dot_operands(pm, True)
passes.ttgpuir.add_remove_layout_conversions(pm)
passes.ttgpuir.add_reduce_data_duplication(pm)
if use_new_pipeliner or options.num_stages != 0:
if amd.has_matrix_core_feature(options.arch):
amd.passes.ttgpuir.add_reorder_instructions(pm)
amd.passes.ttgpuir.add_canonicalize_pointers(pm)
passes.common.add_canonicalizer(pm)
Expand Down
2 changes: 0 additions & 2 deletions third_party/amd/include/TritonAMDGPUTransforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

namespace mlir {

std::unique_ptr<Pass> createTritonAMDGPUStreamPipelinePass();

std::unique_ptr<Pass> createTritonAMDGPUStreamPipelineV2Pass(int numStages = 2);

std::unique_ptr<Pass>
Expand Down
13 changes: 0 additions & 13 deletions third_party/amd/include/TritonAMDGPUTransforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,6 @@

include "mlir/Pass/PassBase.td"

def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::ModuleOp"> {
let summary = "pipeline";

let description = [{
Pipeline global loads through registers to shared memory while computing on previous
tile
}];

let constructor = "mlir::createTritonAMDGPUStreamPipelinePass()";

let dependentDialects = [];
}

def TritonAMDGPUStreamPipelineV2 : Pass<"tritonamdgpu-stream-pipeline-v2", "mlir::ModuleOp"> {
let summary = "pipeline";

Expand Down
1 change: 0 additions & 1 deletion third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ add_triton_library(TritonAMDGPUTransforms
CanonicalizePointers.cpp
OptimizeEpilogue.cpp
ReorderInstructions.cpp
StreamPipeline.cpp
StreamPipelineV2.cpp
MfmaGroup.cpp

Expand Down
Loading

0 comments on commit 76ed94d

Please sign in to comment.