diff --git a/src/gt4py/cartesian/backend/dace_backend.py b/src/gt4py/cartesian/backend/dace_backend.py index 204ce06d7f..60da2c36ff 100644 --- a/src/gt4py/cartesian/backend/dace_backend.py +++ b/src/gt4py/cartesian/backend/dace_backend.py @@ -25,6 +25,7 @@ from dace.serialize import dumps from gt4py import storage as gt_storage +from gt4py.cartesian import config as gt_config from gt4py.cartesian.backend.base import CLIBackendMixin, register from gt4py.cartesian.backend.gtc_common import ( BackendCodegen, @@ -543,6 +544,8 @@ def keep_line(line): def apply(cls, stencil_ir: gtir.Stencil, builder: "StencilBuilder", sdfg: dace.SDFG): self = cls() with dace.config.temporary_config(): + if gt_config.GT4PY_USE_HIP: + dace.config.Config.set("compiler", "cuda", "backend", value="hip") dace.config.Config.set("compiler", "cuda", "max_concurrent_streams", value=-1) dace.config.Config.set("compiler", "cpu", "openmp_sections", value=False) code_objects = sdfg.generate_code() diff --git a/src/gt4py/cartesian/backend/pyext_builder.py b/src/gt4py/cartesian/backend/pyext_builder.py index d295948608..d2aa34bdae 100644 --- a/src/gt4py/cartesian/backend/pyext_builder.py +++ b/src/gt4py/cartesian/backend/pyext_builder.py @@ -85,43 +85,58 @@ def get_gt_pyext_build_opts( "-isystem{}".format(gt_config.build_settings["boost_include_path"]), "-DBOOST_PP_VARIADICS", *extra_compile_args_from_config["cxx"], - ], - nvcc=[ - "-std=c++17", - "-ftemplate-depth={}".format(gt_config.build_settings["cpp_template_depth"]), - "-arch=sm_{}".format(cuda_arch), + ] + ) + extra_compile_args["cuda"] = [ + "-std=c++17", + "-ftemplate-depth={}".format(gt_config.build_settings["cpp_template_depth"]), + "-DBOOST_PP_VARIADICS", + "-DBOOST_OPTIONAL_CONFIG_USE_OLD_IMPLEMENTATION_OF_OPTIONAL", + "-DBOOST_OPTIONAL_USE_OLD_DEFINITION_OF_NONE", + *extra_compile_args_from_config["cuda"], + ] + if gt_config.GT4PY_USE_HIP: + extra_compile_args["cuda"] += [ + "-isystem{}".format(gt_include_path), + "-isystem{}".format(gt_config.build_settings["boost_include_path"]), + "-fvisibility=hidden", + "-fPIC", + ] + else: + extra_compile_args["cuda"] += [ "-isystem={}".format(gt_include_path), "-isystem={}".format(gt_config.build_settings["boost_include_path"]), - "-DBOOST_PP_VARIADICS", - "-DBOOST_OPTIONAL_CONFIG_USE_OLD_IMPLEMENTATION_OF_OPTIONAL", - "-DBOOST_OPTIONAL_USE_OLD_DEFINITION_OF_NONE", + "-arch=sm_{}".format(cuda_arch), "--expt-relaxed-constexpr", "--compiler-options", "-fvisibility=hidden", "--compiler-options", "-fPIC", - *extra_compile_args_from_config["nvcc"], - ], - ) + ] extra_link_args = gt_config.build_settings["extra_link_args"] mode_flags = ["-O0", "-ggdb"] if debug_mode else ["-O3", "-DNDEBUG"] extra_compile_args["cxx"].extend(mode_flags) - extra_compile_args["nvcc"].extend(mode_flags) + extra_compile_args["cuda"].extend(mode_flags) extra_link_args.extend(mode_flags) if dace_path := get_dace_module_path(): extra_compile_args["cxx"].append( "-isystem{}".format(os.path.join(dace_path, "runtime/include")) ) - extra_compile_args["nvcc"].append( - "-isystem={}".format(os.path.join(dace_path, "runtime/include")) - ) + if gt_config.GT4PY_USE_HIP: + extra_compile_args["cuda"].append( + "-isystem{}".format(os.path.join(dace_path, "runtime/include")) + ) + else: + extra_compile_args["cuda"].append( + "-isystem={}".format(os.path.join(dace_path, "runtime/include")) + ) if add_profile_info: profile_flags = ["-pg"] extra_compile_args["cxx"].extend(profile_flags) - extra_compile_args["nvcc"].extend(profile_flags) + extra_compile_args["cuda"].extend(profile_flags) extra_link_args.extend(profile_flags) if uses_cuda: @@ -142,8 +157,11 @@ def get_gt_pyext_build_opts( if uses_cuda: cuda_flags = [] for cpp_flag in cpp_flags: - cuda_flags.extend(["--compiler-options", cpp_flag]) - build_opts["extra_compile_args"]["nvcc"].extend(cuda_flags) + if gt_config.GT4PY_USE_HIP: + cuda_flags.extend([cpp_flag]) + else: + cuda_flags.extend(["--compiler-options", cpp_flag]) + build_opts["extra_compile_args"]["cuda"].extend(cuda_flags) elif cpp_flags: build_opts["extra_compile_args"].extend(cpp_flags) @@ -297,7 +315,10 @@ def build_pybind_cuda_ext( library_dirs = library_dirs or [] library_dirs = [*library_dirs, gt_config.build_settings["cuda_library_path"]] libraries = libraries or [] - libraries = [*libraries, "cudart"] + if gt_config.GT4PY_USE_HIP: + libraries = [*libraries, "hiprtc"] + else: + libraries = [*libraries, "cudart"] extra_compile_args = extra_compile_args or [] return build_pybind_ext( @@ -342,16 +363,19 @@ def build_extensions(self) -> None: # Save references to the original methods original_compile = self.compiler._compile - def nvcc_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + def cuda_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): original_compiler_so = self.compiler.compiler_so # Copy before we make any modifications. cflags = copy.deepcopy(extra_postargs) try: if os.path.splitext(src)[-1] == ".cu": - nvcc_exec = os.path.join(gt_config.build_settings["cuda_bin_path"], "nvcc") - self.compiler.set_executable("compiler_so", [nvcc_exec]) + if gt_config.GT4PY_USE_HIP: + cuda_exec = os.path.join(gt_config.build_settings["cuda_bin_path"], "hipcc") + else: + cuda_exec = os.path.join(gt_config.build_settings["cuda_bin_path"], "nvcc") + self.compiler.set_executable("compiler_so", [cuda_exec]) if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["cuda"] elif isinstance(cflags, dict): cflags = cflags["cxx"] @@ -360,6 +384,6 @@ def nvcc_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Put the original compiler back in place. self.compiler.set_executable("compiler_so", original_compiler_so) - self.compiler._compile = nvcc_compile + self.compiler._compile = cuda_compile build_ext.build_extensions(self) self.compiler._compile = original_compile diff --git a/src/gt4py/cartesian/config.py b/src/gt4py/cartesian/config.py index 676e1f58d4..23bf36de6c 100644 --- a/src/gt4py/cartesian/config.py +++ b/src/gt4py/cartesian/config.py @@ -32,6 +32,17 @@ CUDA_HOST_CXX: Optional[str] = os.environ.get("CUDA_HOST_CXX", None) +if "GT4PY_USE_HIP" in os.environ: + GT4PY_USE_HIP: bool = bool(int(os.environ["GT4PY_USE_HIP"])) +else: + # Autodetect cupy with ROCm/HIP support + try: + import cupy as _cp + + GT4PY_USE_HIP = _cp.cuda.get_hipcc_path() is not None + del _cp + except Exception: + GT4PY_USE_HIP = False GT_INCLUDE_PATH: str = os.path.abspath(gridtools_cpp.get_include_dir()) @@ -42,28 +53,31 @@ "boost_include_path": os.path.join(BOOST_ROOT, "include"), "cuda_bin_path": os.path.join(CUDA_ROOT, "bin"), "cuda_include_path": os.path.join(CUDA_ROOT, "include"), - "cuda_library_path": os.path.join(CUDA_ROOT, "lib64"), "cuda_arch": os.environ.get("CUDA_ARCH", None), "gt_include_path": os.environ.get("GT_INCLUDE_PATH", GT_INCLUDE_PATH), "openmp_cppflags": os.environ.get("OPENMP_CPPFLAGS", "-fopenmp").split(), "openmp_ldflags": os.environ.get("OPENMP_LDFLAGS", "-fopenmp").split(), "extra_compile_args": { "cxx": [], - "nvcc": [], + "cuda": [], }, "extra_link_args": [], "parallel_jobs": multiprocessing.cpu_count(), "cpp_template_depth": os.environ.get("GT_CPP_TEMPLATE_DEPTH", GT_CPP_TEMPLATE_DEPTH), } +if GT4PY_USE_HIP: + build_settings["cuda_library_path"] = os.path.join(CUDA_ROOT, "lib") +else: + build_settings["cuda_library_path"] = os.path.join(CUDA_ROOT, "lib64") if CUDA_HOST_CXX is not None: - build_settings["extra_compile_args"]["nvcc"].append(f"-ccbin={CUDA_HOST_CXX}") + build_settings["extra_compile_args"]["cuda"].append(f"-ccbin={CUDA_HOST_CXX}") cache_settings: Dict[str, Any] = { "dir_name": os.environ.get("GT_CACHE_DIR_NAME", ".gt_cache"), "root_path": os.environ.get("GT_CACHE_ROOT", os.path.abspath(".")), "load_retries": int(os.environ.get("GT_CACHE_LOAD_RETRIES", 3)), - "load_retry_delay": int(os.environ.get("GT_CACHE_LOAD_RETRY_DELAY", 100)), # unit miliseconds + "load_retry_delay": int(os.environ.get("GT_CACHE_LOAD_RETRY_DELAY", 100)), # unit milliseconds } code_settings: Dict[str, Any] = {"root_package_name": "_GT_"} diff --git a/src/gt4py/storage/utils.py b/src/gt4py/storage/utils.py index cbc3529f49..a68fae1fb5 100644 --- a/src/gt4py/storage/utils.py +++ b/src/gt4py/storage/utils.py @@ -20,6 +20,8 @@ import numpy as np import numpy.typing as npt +from gt4py.cartesian import config as gt_config + if np.lib.NumpyVersion(np.__version__) >= "1.20.0": from numpy.typing import DTypeLike @@ -243,6 +245,18 @@ def allocate_gpu( if device_field.ndim > 0: device_field = device_field[tuple(slice(0, s, None) for s in shape)] + if gt_config.GT4PY_USE_HIP: + # HIP/ROCm lack support for __cuda_array_interface__ + device_field.__hip_array_interface__ = { + "shape": padded_shape, + "typestr": device_field.dtype.descr[0][1], + "descr": device_field.dtype.descr, + "stream": 1, + "version": 3, + "strides": strides, + "data": (device_field.data.ptr, False), + } + return device_raw_buffer, device_field