diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 3baa2cfb9acc..5639f3e4f399 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -889,6 +889,30 @@ build_ubuntu_gpu_cmake() { ninja -v } +build_ubuntu_gpu_cmake_no_rtc() { + set -ex + cd /work/build + build_ccache_wrappers + cmake \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_CUDA=ON \ + -DUSE_CUDNN=ON \ + -DUSE_TVM_OP=ON \ + -DPython3_EXECUTABLE=/usr/bin/python3 \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLML_MKL=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_DIST_KVSTORE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DBUILD_CYTHON_MODULES=1 \ + -DENABLE_CUDA_RTC=OFF \ + -G Ninja \ + /work/mxnet + + ninja -v +} + build_ubuntu_gpu_cmake_no_tvm_op() { set -ex cd /work/build diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 61fae5259faa..621121b0b617 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -311,6 +311,19 @@ def compile_unix_cmake_gpu_no_tvm_op() { }] } +def compile_unix_cmake_gpu_no_rtc() { + return ['GPU: CMake CUDA RTC OFF': { + node(NODE_LINUX_CPU) { + ws('workspace/build-cmake-gpu-no-rtc') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_rtc', false) + } + } + } + }] +} + def compile_unix_tensorrt_gpu() { return ['TensorRT': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 8c07ee89798a..a60c747c4236 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -43,6 +43,7 @@ core_logic: { custom_steps.compile_unix_int64_gpu(), custom_steps.compile_unix_full_gpu_no_tvm_op(), custom_steps.compile_unix_cmake_gpu_no_tvm_op(), + custom_steps.compile_unix_cmake_gpu_no_rtc(), custom_steps.compile_unix_full_gpu_mkldnn_cpp_test() ]) diff --git a/config/config.cmake b/config/config.cmake index 
cfd6f2637465..101e43f57699 100644 --- a/config/config.cmake +++ b/config/config.cmake @@ -125,5 +125,5 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.") -set(ENABLE_CUDA_RTC OFF CACHE BOOL "Build with CUDA runtime compilation support") +set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support") set(USE_NVTX ON CACHE BOOL "Build with NVTX support") diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 239e3e88f57f..c56d8cf198b5 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -230,7 +230,7 @@ void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) { void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph, const Context& context, size_t num_forward_outputs, const bool inlining) { -#if MXNET_USE_CUDA && !defined(_WIN32) +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", true)) { @@ -265,7 +265,13 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; } } -#endif // MXNET_USE_CUDA +#else + // Only warn user if MXNET_USE_FUSION env var is explicitly set + if (context.dev_mask() == kGPU && !inlining && + dmlc::GetEnv("MXNET_USE_FUSION", false)) { + exec::WarnFusionNotSupported(); + } +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) *fwd_graph = nnvm::Graph(); fwd_graph->outputs = std::vector<nnvm::NodeEntry>(full_graph->outputs.begin(),