[UnitTest][NVPTX] Avoid cascading failures from CUDA postproc
Prior to this commit, the tests in
`test_tir_transform_inject_ptx_async_copy.py` registered the
`"tvm_callback_cuda_postproc"` function during pytest collection, and
used a global variable to disable its functionality outside of the
tests in this file.  This had two major issues.  First, if any other
test also installed a postproc function, the postproc function
required by the NVPTX tests would be overwritten.  Second, if one of
the NVPTX tests failed, the global variable controlling the postproc
function would not be reset, causing any subsequent CUDA-related tests
to also fail.
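
For context, TVM's CUDA build path routes every generated kernel
source through the global `tvm_callback_cuda_postproc` function
whenever one is registered, so a single leaked registration can poison
unrelated builds.  A minimal sketch of the failure mode (the
truncating callback here is a hypothetical stand-in modeled on the old
code in this file):

```python
import tvm


# Test A registers a postproc intended only for devices without
# async-copy support (pre-sm80), truncating the generated source.
@tvm.register_func("tvm_callback_cuda_postproc", override=True)
def tvm_callback_cuda_postproc(code, _):
    # Keep everything up to and including the first kernel signature,
    # then close the kernel body immediately.
    kept = []
    for line in code.split("\n"):
        kept.append(line)
        if line.startswith('extern "C" __global__'):
            break
    return "\n".join(kept) + "\n}"


# If test A fails before resetting its global flag, this callback stays
# active, and every later tvm.build(..., target="cuda") in the same
# pytest session compiles truncated source and errors out.
```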

This commit updates these NVPTX tests to conditionally install the
postproc function, to de-register it after the test instead of
disabling its functionality, and to de-register it regardless of the
test result.
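
The replacement is the standard pytest yield-fixture pattern: save any
existing registration, override it for the duration of the test, and
restore it during teardown, which pytest runs even when the test body
fails.  A minimal sketch of the pattern with illustrative names (the
actual fixture in the diff below additionally gates the registration
on the device's compute capability):

```python
import pytest
import tvm


@pytest.fixture
def scoped_cuda_postproc():
    func_name = "tvm_callback_cuda_postproc"
    # Save whatever callback another test may have registered.
    prev_postproc = tvm.get_global_func(func_name, allow_missing=True)

    @tvm.register_func(func_name, override=True)
    def _postproc(code, _):
        return code  # identity transform, for illustration only

    yield  # the test body runs here

    # Teardown: restore the previous callback so nothing leaks.
    if prev_postproc is not None:
        tvm.register_func(func_name, prev_postproc, override=True)
```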

This issue was initially found when debugging
apache#15103, when a failure in
`test_tir_transform_inject_ptx_async_copy.py::test_cp_async_in_if_then_else`
caused failures in 32 unrelated tests ([CI
link](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-gpu/detail/PR-15103/7/tests)).
Lunderberg committed Jul 4, 2023
1 parent f14c61f commit 22d8a8d
Showing 1 changed file with 36 additions and 47 deletions.
```diff
--- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
@@ -14,11 +14,14 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import numpy as np
-
 import tvm
 import tvm.testing
 from tvm.script import tir as T
 
+import pytest
+import numpy as np
+
 
 def count_cp_async(stmt):
     num_alloc = [0]
@@ -351,36 +354,38 @@ def test_inject_async_copy_shared_dyn():
     """
 
 
-generated_code = ""
-support_async = True
+@pytest.fixture
+def postproc_if_missing_async_support():
+    arch = tvm.contrib.nvcc.get_target_compute_version()
+    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+    support_async = major >= 8
+
+    func_name = "tvm_callback_cuda_postproc"
+    prev_postproc = None
 
-@tvm.register_func
-def tvm_callback_cuda_postproc(code, _):
-    global generated_code
-    global support_async
-    generated_code = code
-    # return a dummy code so that device < sm80 could build correctly
     if not support_async:
-        ret = ""
-        for line in code.split("\n"):
-            ret += line + "\n"
-            if line.startswith('extern "C" __global__'):
-                break
-        ret += "}"
-        return ret
-    return code
+        prev_postproc = tvm.get_global_func(func_name, allow_missing=True)
+
+        @tvm.register_func(func_name, override=True)
+        def tvm_callback_cuda_postproc(code, _):
+            ret = []
+            for line in code.split("\n"):
+                ret.append(line)
+                ret.append("\n")
+                if line.startswith('extern "C" __global__') and line.endswith("{"):
+                    break
+            ret.append("}")
+            return "".join(ret)
 
-@tvm.testing.requires_cuda
-def test_cp_async_in_if_then_else():
-    global support_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        support_async = False
+    yield
+
+    # Restore previous postproc func to avoid impacting other tests
+    if prev_postproc is not None:
+        tvm.register_func(func_name, prev_postproc, override=True)
+
+
+@tvm.testing.requires_cuda
+def test_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
     def simple_compute(
         A: T.Buffer((16, 14), "float32"),
@@ -421,23 +426,13 @@ def simple_compute(
 
     mod = tvm.IRModule.from_expr(simple_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        tvm.build(mod, target="cuda")
+        built = tvm.build(mod, target="cuda")
+    generated_code = built.imported_modules[0].get_source()
     assert generated_code == expected_cuda_script
 
-    if not support_async:
-        # avoid return dummy code to other tests
-        support_async = True
-
 
 @tvm.testing.requires_cuda
-def test_vectorize_cp_async_in_if_then_else():
-    global support_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        support_async = False
-
+def test_vectorize_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
     def complex_compute(
         A: T.Buffer((2, 16, 16, 1280), "float16"),
@@ -886,17 +881,11 @@ def complex_compute(
 
     mod = tvm.IRModule.from_expr(complex_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        tvm.build(mod, target="cuda")
+        built = tvm.build(mod, target="cuda")
+    generated_code = built.imported_modules[0].get_source()
     # generated_code must contain " setp.ne.b32 p, %0, 0;"
     assert "setp.ne.b32" in generated_code
 
-    if not support_async:
-        # avoid return dummy code to other tests
-        support_async = True
-
 
 if __name__ == "__main__":
-    test_inject_async_copy()
-    test_inject_async_copy_shared_dyn()
-    test_cp_async_in_if_then_else()
-    test_vectorize_cp_async_in_if_then_else()
+    tvm.testing.main()
```
