diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cf24bbcae53..1ca92ff19a93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,7 +180,7 @@ endif() if(NOT mxnet_LINKER_LIBS) set(mxnet_LINKER_LIBS "") -endif(NOT mxnet_LINKER_LIBS) +endif() if(USE_GPROF) message(STATUS "Using GPROF") diff --git a/ci/build.py b/ci/build.py index a21ec44942a8..cbc41218f042 100755 --- a/ci/build.py +++ b/ci/build.py @@ -70,7 +70,8 @@ def get_docker_binary(use_nvidia_docker: bool) -> str: return "nvidia-docker" if use_nvidia_docker else "docker" -def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool) -> str: +def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool, + cache_intermediate: bool) -> str: """ Build a container for the given platform :param platform: Platform @@ -104,6 +105,8 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries: "--build-arg", "GROUP_ID={}".format(os.getgid())] if no_cache: cmd.append("--no-cache") + if cache_intermediate: + cmd.append("--rm=false") elif registry: cmd.extend(["--cache-from", tag]) cmd.extend(["-t", tag, get_dockerfiles_path()]) @@ -330,6 +333,9 @@ def main() -> int: parser.add_argument("--no-cache", action="store_true", help="passes --no-cache to docker build") + parser.add_argument("--cache-intermediate", action="store_true", + help="passes --rm=false to docker build") + parser.add_argument("-e", "--environment", nargs="*", default=[], help="Environment variables for the docker container. " "Specify with a list containing either names or name=value") @@ -361,7 +367,8 @@ def main() -> int: load_docker_cache(tag=tag, docker_registry=args.docker_registry) if not args.run_only: build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache) + num_retries=args.docker_build_retries, no_cache=args.no_cache, + cache_intermediate=args.cache_intermediate) else: logging.info("Skipping docker build step.") diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7 index 2c923a015b63..96ca04e9f5e6 100644 --- a/ci/docker/Dockerfile.build.android_armv7 +++ b/ci/docker/Dockerfile.build.android_armv7 @@ -18,62 +18,41 @@ # # Dockerfile to build MXNet for Android ARMv7 -FROM dockcross/base -MAINTAINER Pedro Larroy "pllarroy@amazon.com" - -# The cross-compiling emulator -RUN apt-get update && apt-get install -y \ - unzip - -ENV CROSS_TRIPLE=arm-linux-androideabi -ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} -ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ - AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ - CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ - CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ - CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ - LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld - -ENV ANDROID_NDK_REVISION 17b -ENV ANDROID_NDK_API 27 -ENV ANDROID_NDK_ARCH arm -WORKDIR /work/deps -COPY install/android_ndk.sh /work/deps -RUN /work/deps/android_ndk.sh - -ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm - -# Build-time metadata as defined at http://label-schema.org -ARG BUILD_DATE -ARG IMAGE -ARG VCS_REF -ARG VCS_URL -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.name=$IMAGE \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url=$VCS_URL \ - org.label-schema.schema-version="1.0" - - -ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang -ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ - -WORKDIR /work/deps - -COPY 
install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh -WORKDIR /work -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -COPY install/android_armv7_openblas.sh /work/deps -RUN /work/deps/android_armv7_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -WORKDIR /work +FROM ubuntu:20.04 + +ENV ARCH=armv7l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV7 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -o android-ndk-r19-linux-x86_64.zip -L https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \ + unzip android-ndk-r19-linux-x86_64.zip && \ + rm android-ndk-r19-linux-x86_64.zip +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + mkdir /usr/local/openblas-android && \ + cd /usr/local/OpenBLAS && \ + export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ + make NOFORTRAN=1 ARM_SOFTFP_ABI=1 \ + LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x -lm" \ + CC=$TOOLCHAIN/bin/armv7a-linux-androideabi16-clang AR=$TOOLCHAIN/bin/arm-linux-androideabi-ar && \ + make PREFIX=/usr/local/openblas-android install && \ + cd /usr/local && \ + rm -rf OpenBLAS +ENV OpenBLAS_HOME=/usr/local/openblas-android ARG USER_ID=0 ARG GROUP_ID=0 @@ -81,5 +60,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet - +WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8 index ca62288129bb..81adc80edf14 100644 --- a/ci/docker/Dockerfile.build.android_armv8 +++ b/ci/docker/Dockerfile.build.android_armv8 @@ -18,62 +18,41 @@ # # Dockerfile to build MXNet for Android ARM64/ARMv8 -FROM dockcross/base -MAINTAINER Pedro Larroy "pllarroy@amazon.com" - -RUN apt-get update && apt-get install -y \ - unzip - -WORKDIR /work/deps - -# Build x86 dependencies. -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -# Setup Android cross-compilation environment. -ENV CROSS_TRIPLE=aarch64-linux-android -ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} -ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ - AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ - CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ - CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ - CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ - LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld - - -ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm - -# Build-time metadata as defined at http://label-schema.org -ARG BUILD_DATE -ARG IMAGE -ARG VCS_REF -ARG VCS_URL -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.name=$IMAGE \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url=$VCS_URL \ - org.label-schema.schema-version="1.0" - -ENV ARCH aarch64 -ENV ANDROID_NDK_REVISION 17b -ENV ANDROID_NDK_API 27 -ENV ANDROID_NDK_ARCH arm64 -WORKDIR /work/deps -COPY install/android_ndk.sh /work/deps -RUN /work/deps/android_ndk.sh - - -WORKDIR /work/deps -COPY install/android_ndk.sh /work/ -RUN /work/android_ndk.sh - -ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang -ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ - -# Build ARM dependencies. 
-COPY install/android_arm64_openblas.sh /work/ -RUN /work/android_arm64_openblas.sh -ENV CPLUS_INCLUDE_PATH /work/deps/OpenBLAS +FROM ubuntu:20.04 + +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV8 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -o android-ndk-r19-linux-x86_64.zip -L https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \ + unzip android-ndk-r19-linux-x86_64.zip && \ + rm android-ndk-r19-linux-x86_64.zip +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + mkdir /usr/local/openblas-android && \ + cd /usr/local/OpenBLAS && \ + export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ + make NOFORTRAN=1 \ + LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/lib/gcc/aarch64-linux-android/4.9.x -lm" \ + CC=$TOOLCHAIN/bin/aarch64-linux-android21-clang AR=$TOOLCHAIN/bin/aarch64-linux-android-ar && \ + make PREFIX=/usr/local/openblas-android install && \ + cd /usr/local && \ + rm -rf OpenBLAS +ENV OpenBLAS_HOME=/usr/local/openblas-android ARG USER_ID=0 ARG GROUP_ID=0 @@ -81,5 +60,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6 index 98414d8c2aff..02e16da11616 100644 --- a/ci/docker/Dockerfile.build.armv6 +++ b/ci/docker/Dockerfile.build.armv6 @@ -18,26 +18,42 @@ # # Dockerfile to build MXNet for ARMv6 -FROM dockcross/linux-armv6 +FROM ubuntu:20.04 -ENV ARCH armv6l -ENV HOSTCC gcc-6 -ENV HOSTCXX g++-6 -ENV TARGET ARMV6 +ENV ARCH=armv6l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV6 -WORKDIR /work/deps +WORKDIR /usr/local -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh +# We use a toolchain from toolchains.bootlin.com instead of the Debian / Ubuntu +# crossbuild-essential-armel toolchain, as the latter targets ARM architecture +# versions 4T, 5T, and 6, whereas we only wish to target ARMV6 and would like to use +# ARMV6-specific features.
https://wiki.debian.org/ArmEabiPort +RUN curl -o armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 -L https://toolchains.bootlin.com/downloads/releases/toolchains/armv6-eabihf/tarballs/armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 && \ + tar xf armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 && \ + rm armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/share/buildroot/toolchainfile.cmake -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc && \ + make PREFIX=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7 index f0cdd90322f0..a9cc6d1e83a4 100644 --- a/ci/docker/Dockerfile.build.armv7 +++ b/ci/docker/Dockerfile.build.armv7 @@ -16,28 +16,39 @@ # specific language governing permissions and limitations # under the License. # -# Dockerfile to build MXNet for Android ARMv7 - -FROM dockcross/linux-armv7 - -ENV ARCH armv7l -ENV HOSTCC gcc-7 -ENV HOSTCXX g++-7 -ENV TARGET ARMV7 - -WORKDIR /work/deps - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +# Dockerfile to build MXNet for ARMv7 + +FROM ubuntu:20.04 + +ENV ARCH=armv7l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV7 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + crossbuild-essential-armhf \ + && rm -rf /var/lib/apt/lists/* + +COPY toolchains/arm-linux-gnueabihf-toolchain.cmake /usr/local +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/arm-linux-gnueabihf-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=arm-linux-gnueabihf-gcc && \ + make PREFIX=/usr/local/arm-linux-gnueabihf install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8 index 2f918e55d991..adf6873fb40c 100644 --- a/ci/docker/Dockerfile.build.armv8 +++ b/ci/docker/Dockerfile.build.armv8 @@ -18,26 +18,37 @@ # # Dockerfile to build MXNet for ARM64/ARMv8 -FROM dockcross/linux-arm64 - -ENV ARCH aarch64 -ENV HOSTCC gcc-7 -ENV HOSTCXX g++-7 -ENV TARGET ARMV8 - -WORKDIR /work/deps - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +FROM ubuntu:20.04 + +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV8 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + crossbuild-essential-arm64 \ + && rm -rf 
/var/lib/apt/lists/* + +COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr +ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 @@ -45,4 +56,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/build +WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index 55575d17970f..93fe5e0a5b0d 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -20,63 +20,58 @@ # This script assumes /work/mxnet exists and contains the mxnet code you wish to compile and # that /work/build exists and is the target for your output. -FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 -FROM dockcross/linux-arm64 +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + TARGET=ARMV8 -ENV ARCH aarch64 -ENV HOSTCC gcc -ENV TARGET ARMV8 +WORKDIR /usr/local -WORKDIR /work/deps +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + git \ + curl \ + zip \ + unzip \ + python3 \ + python3-pip \ + awscli \ + crossbuild-essential-arm64 \ + && rm -rf /var/lib/apt/lists/* -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} +# cmake on Ubuntu 18.04 is too old +RUN python3 -m pip install cmake +# ccache on Ubuntu 18.04 is too old to support Cuda correctly COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -# Setup CUDA build env (including configuring and copying nvcc) -COPY --from=cudabuilder /usr/local/cuda /usr/local/cuda -ENV TARGET_ARCH aarch64 -ENV TARGET_OS linux +COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr +ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu install && \ + cd /usr/local && \ + rm -rf OpenBLAS -# Install ARM depedencies based on Jetpack 3.3 -RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcenter/mobile/jetpack_l4t/3.3/lw.xd42/JetPackL4T_33_b39 && \ - CUDA_REPO_PREFIX=/var/cuda-repo-9-0-local && \ - ARM_CUDA_INSTALLER_PACKAGE=cuda-repo-l4t-9-0-local_9.0.252-1_arm64.deb && \ - ARM_CUDNN_INSTALLER_PACKAGE=libcudnn7_7.1.5.14-1+cuda9.0_arm64.deb && \ - ARM_CUDNN_DEV_INSTALLER_PACKAGE=libcudnn7-dev_7.1.5.14-1+cuda9.0_arm64.deb && \ - ARM_LICENSE_INSTALLER=cuda-license-9-0_9.0.252-1_arm64.deb && \ - ARM_CUBLAS_INSTALLER=cuda-cublas-9-0_9.0.252-1_arm64.deb && \ - ARM_NVINFER_INSTALLER_PACKAGE=libnvinfer4_4.1.3-1+cuda9.0_arm64.deb && \ - ARM_NVINFER_DEV_INSTALLER_PACKAGE=libnvinfer-dev_4.1.3-1+cuda9.0_arm64.deb && \ - dpkg --add-architecture arm64 && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDA_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDNN_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDNN_DEV_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_NVINFER_INSTALLER_PACKAGE && \ - wget -nv 
$JETPACK_DOWNLOAD_PREFIX/$ARM_NVINFER_DEV_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_CUDA_INSTALLER_PACKAGE && \ - apt-key add $CUDA_REPO_PREFIX/7fa2af80.pub && \ - dpkg -i --force-architecture $ARM_CUDNN_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_CUDNN_DEV_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $CUDA_REPO_PREFIX/$ARM_LICENSE_INSTALLER && \ - dpkg -i --force-architecture $CUDA_REPO_PREFIX/$ARM_CUBLAS_INSTALLER && \ - dpkg -i --force-architecture $ARM_NVINFER_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \ - apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev -RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h -ENV PATH $PATH:/usr/local/cuda/bin -ENV NVCCFLAGS "-m64" -ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" -ENV NVCC /usr/local/cuda/bin/nvcc +# Install aarch64 cross dependencies based on Jetpack 4.3 +# Manually downloaded using SDK Manager tool and placed in a private S3 bucket. +# We're not allowed to redistribute these files and there is no public version. +RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb . && \ + dpkg -i cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb && \ + rm cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb && \ + apt-key add /var/cuda-repo-10-0-local-10.0.326-410.108/7fa2af80.pub && \ + aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb . && \ + dpkg -i cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb && \ + rm cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb && \ + apt-get update && \ + apt-get install -y -f && \ + apt-get install -y cuda-cross-aarch64 cuda-cross-aarch64-10-0 && \ + rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/install/android_armv7_openblas.sh b/ci/docker/install/android_armv7_openblas.sh deleted file mode 100755 index 8642df6d9450..000000000000 --- a/ci/docker/install/android_armv7_openblas.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd .
-git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make TARGET=ARMV7 HOSTCC=gcc-7 HOSTCXX=g++-7 NOFORTRAN=1 ARM_SOFTFP_ABI=1 -j$(nproc) libs -#make PREFIX=${CROSS_ROOT} TARGET=ARMV7 HOSTCC=gcc NOFORTRAN=1 ARM_SOFTFP_ABI=1 install -cp *.h ${CROSS_ROOT}/include -cp libopenblas*.a ${CROSS_ROOT}/lib -popd diff --git a/ci/docker/install/android_ndk.sh b/ci/docker/install/android_ndk.sh deleted file mode 100755 index cb83aa65639a..000000000000 --- a/ci/docker/install/android_ndk.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . -# This environment variable comes from the docker file -echo "Downloading android SDK rev ${ANDROID_NDK_REVISION}" -curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ -unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ -cd android-ndk-r${ANDROID_NDK_REVISION} && \ -./build/tools/make_standalone_toolchain.py \ - --stl=libc++ \ - --arch ${ANDROID_NDK_ARCH}\ - --api ${ANDROID_NDK_API}\ - --install-dir=${CROSS_ROOT} && \ - -find ${CROSS_ROOT} -exec chmod a+r '{}' \; && \ -find ${CROSS_ROOT} -executable -exec chmod a+x '{}' \; -popd diff --git a/ci/docker/install/arm64_openblas.sh b/ci/docker/install/arm64_openblas.sh deleted file mode 100755 index 88f2e98cd65b..000000000000 --- a/ci/docker/install/arm64_openblas.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . 
-wget -nv https://api.github.com/repos/xianyi/OpenBLAS/git/refs/heads/master -O openblas_version.json -echo "Using openblas:" -cat openblas_version.json -git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) TARGET=ARMV8 -make install -ln -s /opt/OpenBLAS/lib/libopenblas.so /usr/lib/libopenblas.so -ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/libopenblas.a -ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/liblapack.a -popd diff --git a/ci/docker/install/deb_ubuntu_ccache.sh b/ci/docker/install/deb_ubuntu_ccache.sh index cdc9354e220f..ef913ba36e55 100755 --- a/ci/docker/install/deb_ubuntu_ccache.sh +++ b/ci/docker/install/deb_ubuntu_ccache.sh @@ -23,7 +23,7 @@ set -ex pushd . -apt update || true +apt update apt install -y \ autoconf \ gperf \ @@ -32,31 +32,9 @@ apt install -y \ mkdir -p /work/deps cd /work/deps -# Unset ARM toolchain cross-compilation configuration on dockcross -unset ARCH -unset DEFAULT_DOCKCROSS_IMAGE -unset CROSS_TRIPLE -unset CC -unset AS -unset AR -unset FC -unset CXX -unset CROSS_ROOT -unset CROSS_COMPILE -unset PKG_CONFIG_PATH -unset CMAKE_TOOLCHAIN_FILE -unset CPP -unset LD -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - git clone --recursive https://github.com/ccache/ccache.git cd ccache -git checkout v3.7.8 -# Backport cuda related fixes: https://github.com/ccache/ccache/pull/381 -git config user.name "MXNet CI" -git config user.email "MXNetCI@example.com" -git cherry-pick --strategy-option=theirs c4fffda031034f930df2cf188878b8f9160027df -git cherry-pick 0dec5c2df3e3ebc1fbbf33f74c992bef6264f37a +git checkout v3.7.9 ./autogen.sh ./configure --disable-man diff --git a/ci/docker/install/ubuntu_arm.sh b/ci/docker/install/ubuntu_arm.sh deleted file mode 100755 index 608d0362f138..000000000000 --- a/ci/docker/install/ubuntu_arm.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -apt update || true -apt install -y \ - unzip \ - python3 \ - python3-pip - -pip3 install setuptools diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 03f8b2c1c3d8..4e7b0db1b648 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -216,15 +216,22 @@ build_dynamic_libmxnet() { build_jetson() { set -ex - pushd . 
- - export CC=gcc-6 - export CXX=g++-6 - cp make/crosscompile.jetson.mk ./config.mk - make -j$(nproc) - - build_wheel /work/mxnet/python /work/mxnet/lib - popd + cd /work/build + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="5.2" \ + -DENABLE_CUDA_RTC=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=ON \ + -DUSE_LAPACK=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -G Ninja /work/mxnet + ninja + build_wheel } # @@ -252,7 +259,7 @@ build_armv6() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_LAPACK=OFF \ -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ + -Dmxnet_LINKER_LIBS=-latomic \ -G Ninja /work/mxnet ninja @@ -279,7 +286,6 @@ build_armv7() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_LAPACK=OFF \ -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ -G Ninja /work/mxnet ninja @@ -289,14 +295,15 @@ build_armv7() { build_armv8() { cd /work/build cmake \ - -DUSE_CUDA=OFF\ - -DSUPPORT_F16C=OFF\ - -DUSE_OPENCV=OFF\ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DUSE_CUDA=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_OPENCV=OFF \ -DUSE_OPENMP=ON \ - -DUSE_LAPACK=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=Release\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DUSE_LAPACK=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja build_wheel @@ -311,15 +318,18 @@ build_android_armv7() { set -ex cd /work/build cmake \ - -DANDROID=ON\ - -DUSE_CUDA=OFF\ - -DUSE_SSE=OFF\ - -DSUPPORT_F16C=OFF\ - -DUSE_LAPACK=OFF\ - -DUSE_OPENCV=OFF\ - -DUSE_OPENMP=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DANDROID_ABI="armeabi-v7a" \ + -DANDROID_STL="c++_shared" \ + -DANDROID=ON \ + -DUSE_CUDA=OFF \ + -DUSE_SSE=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_LAPACK=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja } @@ -328,14 +338,17 @@ build_android_armv8() { set -ex cd /work/build cmake \ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DANDROID_ABI="arm64-v8a" \ + -DANDROID_STL="c++_shared" \ -DANDROID=ON \ - -DUSE_CUDA=OFF\ - -DUSE_SSE=OFF\ - -DUSE_LAPACK=OFF\ - -DUSE_OPENCV=OFF\ - -DUSE_OPENMP=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DUSE_CUDA=OFF \ + -DUSE_SSE=OFF \ + -DUSE_LAPACK=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja } diff --git a/ci/docker/install/android_arm64_openblas.sh b/ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake old mode 100755 new mode 100644 similarity index 64% rename from ci/docker/install/android_arm64_openblas.sh rename to ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake index f9647d969bf2..3780415c4b15 --- a/ci/docker/install/android_arm64_openblas.sh +++ b/ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,16 +15,14 @@ # specific language governing permissions and limitations # under the License. 
-# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR "aarch64") +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) +set(CMAKE_CUDA_HOST_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_FIND_ROOT_PATH "/usr/aarch64-linux-gnu") -set -ex -pushd . -git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) TARGET=ARMV8 ARM_SOFTFP_ABI=1 HOSTCC=gcc-7 HOSTCXX=g++-7 NOFORTRAN=1 libs -# Can't be run (utility not compiled for the target platform) -#make install -cp *.h /usr/include -cp libopenblas.a /usr/local/lib -popd +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/ci/docker/install/arm_openblas.sh b/ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake old mode 100755 new mode 100644 similarity index 65% rename from ci/docker/install/arm_openblas.sh rename to ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake index fa2e5cae9cba..62038ecee16a --- a/ci/docker/install/arm_openblas.sh +++ b/ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,14 +15,13 @@ # specific language governing permissions and limitations # under the License. -set -ex - -git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git - -cd OpenBLAS -make -j$(nproc) -PREFIX=${CROSS_ROOT} make install - -cd .. +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR "armv7l") +set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) +set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) +set(CMAKE_FIND_ROOT_PATH "/usr/arm-linux-gnueabihf" "/usr/local/arm-linux-gnueabihf") -rm -rf OpenBLAS +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/cmake/upstream/FindCUDAToolkit.cmake b/cmake/upstream/FindCUDAToolkit.cmake index d37c44d9c782..fee4f3f4f698 100644 --- a/cmake/upstream/FindCUDAToolkit.cmake +++ b/cmake/upstream/FindCUDAToolkit.cmake @@ -132,6 +132,7 @@ of the following libraries that are part of the CUDAToolkit: - :ref:`cuRAND` - :ref:`cuSOLVER` - :ref:`cuSPARSE` +- :ref:`cuPTI` - :ref:`NPP` - :ref:`nvBLAS` - :ref:`nvGRAPH` @@ -149,7 +150,6 @@ CUDA Runtime Library The CUDA Runtime library (cudart) are what most applications will typically need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. -They are an explicit dependency of almost every library. Targets Created: @@ -230,6 +230,18 @@ Targets Created: - ``CUDA::cusparse`` - ``CUDA::cusparse_static`` +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + .. _`cuda_toolkit_NPP`: NPP @@ -361,8 +373,6 @@ Targets Created: - ``CUDA::nvml`` -.. _`cuda_toolkit_opencl`: - .. _`cuda_toolkit_nvToolsExt`: nvToolsExt @@ -375,6 +385,8 @@ Targets Created: - ``CUDA::nvToolsExt`` +.. _`cuda_toolkit_opencl`: + OpenCL """""" @@ -436,6 +448,11 @@ Result variables The path to the CUDA Toolkit library directory that contains the CUDA Runtime library ``cudart``. 
+``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalent to + ``CUDAToolkit_ROOT_DIR``. + ``CUDAToolkit_NVCC_EXECUTABLE`` The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may **not** be the same as @@ -487,6 +504,7 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR) get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) # use the already detected cuda compiler set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + mark_as_advanced(CUDAToolkit_BIN_DIR) unset(cuda_dir) endif() @@ -641,6 +659,7 @@ endif() if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) unset(cuda_dir) endif() @@ -669,8 +688,47 @@ endif() get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) -# Now that we have the real ROOT_DIR, find components inside it. -list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) +# Handle cross compilation +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + # Support for arm cross compilation + set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + # Support for aarch64 cross compilation + if (ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") + else() + set(CUDAToolkit_TARGET_NAME "aarch64-linux") + endif (ANDROID_ARCH_NAME STREQUAL "arm64") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") + endif() + + if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATH + set(_CUDAToolkit_Pop_ROOT_PATH True) + endif() +else() + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library.
+ set(_CUDAToolkit_Pop_Prefix True) +endif() + # Find the include/ directory find_path(CUDAToolkit_INCLUDE_DIR @@ -680,14 +738,17 @@ find_path(CUDAToolkit_INCLUDE_DIR # And find the CUDA Runtime Library libcudart find_library(CUDA_CUDART NAMES cudart - PATH_SUFFIXES lib64 lib/x64 + PATH_SUFFIXES lib64 lib64/stubs lib/x64 ) if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() unset(CUDAToolkit_ROOT_DIR) -list(REMOVE_AT CMAKE_PREFIX_PATH -1) +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. @@ -702,6 +763,10 @@ find_package_handle_standard_args(CUDAToolkit VERSION_VAR CUDAToolkit_VERSION ) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_INCLUDE_DIR + CUDAToolkit_NVCC_EXECUTABLE + ) #----------------------------------------------------------------------------- # Construct result variables @@ -714,78 +779,103 @@ endif() # Construct import targets if(CUDAToolkit_FOUND) - function(find_and_add_cuda_import_lib lib_name) + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN}) - if(ARGC GREATER 1) - set(search_names ${ARGN}) - else() - set(search_names ${lib_name}) - endif() + set(search_names ${lib_name} ${arg_ALT}) find_library(CUDA_${lib_name}_LIBRARY NAMES ${search_names} - PATHS ${CUDAToolkit_LIBRARY_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + PATH_SUFFIXES nvidia/current lib64 lib64/stubs lib/x64 lib lib/stubs stubs + ${arg_EXTRA_PATH_SUFFIXES} ) + mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() endif() endfunction() - function(add_cuda_link_dependency lib_name) - foreach(dependency IN LISTS ${ARGN}) - target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency}) - endforeach() - endfunction() + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() - add_library(CUDA::toolkit IMPORTED INTERFACE) - target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _CUDAToolkit_find_and_add_import_lib(cudart) + _CUDAToolkit_find_and_add_import_lib(cudart_static) - find_and_add_cuda_import_lib(cuda_driver cuda) + # setup dependencies that are required for cudart_static when building + # on linux. 
These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps + AND TARGET CUDA::cudart_static) - find_and_add_cuda_import_lib(cudart) - find_and_add_cuda_import_lib(cudart_static) + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) - foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg) - find_and_add_cuda_import_lib(${cuda_lib}) - add_cuda_link_dependency(${cuda_lib} cudart) + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() - find_and_add_cuda_import_lib(${cuda_lib}_static) - add_cuda_link_dependency(${cuda_lib}_static cudart_static) + if(UNIX AND NOT APPLE) + # On Linux, you must link against librt when using the static cuda runtime. + find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static) + # cuSOLVER depends on cuBLAS, and cuSPARSE - add_cuda_link_dependency(cusolver cublas cusparse) - add_cuda_link_dependency(cusolver_static cublas_static cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. - add_cuda_link_dependency(nvgraph curand cusolver) - add_cuda_link_dependency(nvgraph_static curand_static cusolver_static) - - find_and_add_cuda_import_lib(nppc) - find_and_add_cuda_import_lib(nppc_static) - - add_cuda_link_dependency(nppc cudart) - add_cuda_link_dependency(nppc_static cudart_static culibos) + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries.
foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - find_and_add_cuda_import_lib(${cuda_lib}) - find_and_add_cuda_import_lib(${cuda_lib}_static) - add_cuda_link_dependency(${cuda_lib} nppc) - add_cuda_link_dependency(${cuda_lib}_static nppc_static) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - find_and_add_cuda_import_lib(nvrtc) - add_cuda_link_dependency(nvrtc cuda_driver) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + + _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - find_and_add_cuda_import_lib(nvml nvidia-ml nvml) + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory @@ -798,17 +888,12 @@ if(CUDAToolkit_FOUND) PATH_SUFFIXES lib/x64 lib ) endif() - find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64) + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - add_cuda_link_dependency(nvToolsExt cudart) - - find_and_add_cuda_import_lib(OpenCL) - - find_and_add_cuda_import_lib(culibos) - if(TARGET CUDA::culibos) - foreach (cuda_lib cublas cufft cusparse curand nvjpeg) - add_cuda_link_dependency(${cuda_lib}_static culibos) - endforeach() - endif() + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) endif() diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk deleted file mode 100644 index 880e2cf5b466..000000000000 --- a/make/crosscompile.jetson.mk +++ /dev/null @@ -1,216 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet -# -# If you want to change the configuration, please use the following -# steps. Assume you are on the root directory of mxnet. First copy the this -# file so that any local changes will be ignored by git -# -# $ cp make/config.mk . -# -# Next modify the according entries, and then compile by -# -# $ make -# -# or build in parallel with 8 threads -# -# $ make -j8 -#------------------------------------------------------------------------------- - -#--------------------- -# For cross compilation we only explictily set a compiler when one is not already present. 
-#-------------------- - -ifndef CC -export CC = gcc -endif -ifndef CXX -export CXX = g++ -endif -ifndef NVCC -export NVCC = nvcc -endif - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on segfault signal handler to log the stack trace -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/ - -# the additional compile flags you want to add -ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/ - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -USE_CUDA_PATH = /usr/local/cuda-9.0/targets/aarch64-linux - -# whether to enable CUDA runtime compilation -ENABLE_CUDA_RTC = 0 - -# whether use CuDNN R3 library -USE_CUDNN = 1 - -#whether to use NCCL library -USE_NCCL = 0 -#add the path to NCCL library -USE_NCCL_PATH = NONE - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 0 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -#whether use libjpeg-turbo for image decode without OpenCV wrapper -USE_LIBJPEG_TURBO = 0 -#add the path to libjpeg-turbo library -USE_LIBJPEG_TURBO_PATH = NONE - -# use openmp for parallelization -USE_OPENMP = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 0 - -# whether use NNPACK library -USE_NNPACK = 0 - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -UNAME_S := $(shell uname -s) -USE_BLAS = openblas - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL only for BLAS, choose static link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -USE_SSE=0 - -# Turn off F16C instruction set support -USE_F16C=0 - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 0 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. 
If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 0 - -#---------------------------- -# performance settings -#---------------------------- -# Use operator tuning -USE_OPERATOR_TUNING = 1 - -# Use gperftools if found -# Disable because of #8968 -USE_GPERFTOOLS = 0 - -# path to gperftools (tcmalloc) library in case of a non-standard installation -USE_GPERFTOOLS_PATH = - -# Use JEMalloc if found, and not using gperftools -USE_JEMALLOC = 1 - -# path to jemalloc library in case of a non-standard installation -USE_JEMALLOC_PATH = - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - -#---------------------------- -# other features -#---------------------------- - -# Create C++ interface package -USE_CPP_PACKAGE = 0 - -# Use int64_t type to represent the total number of elements in the tensor -# This will cause performance degradation reported in issue #14496 -# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 -# Note: the size of each dimension is still bounded by INT32_MAX -USE_INT64_TENSOR_SIZE = 0 - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 3f94cca530c3..c81d90689d58 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -23,7 +23,7 @@ * \brief Operator to shuffle elements of an NDArray */ #if ((__GNUC__ > 4 && !defined(__clang__major__)) || (__clang_major__ > 4 && __linux__)) && \ - defined(_OPENMP) + defined(_OPENMP) && !defined(__ANDROID__) #define USE_GNU_PARALLEL_SHUFFLE #endif
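A minimal sketch of how the new --cache-intermediate option and the cross-compilation images might be exercised locally. It assumes ci/build.py keeps its existing -p/--platform option and positional container command (this patch only adds --cache-intermediate), and that runtime_functions.sh is available at /work/runtime_functions.sh inside the image as in the current CI setup:

    # Build the armv8 cross-compile image; --cache-intermediate passes --rm=false to
    # docker build so intermediate layers are kept when debugging a failing Dockerfile step,
    # then run the corresponding build function from runtime_functions.sh in the container.
    ci/build.py --cache-intermediate -p armv8 /work/runtime_functions.sh build_armv8

    # Inside the container, build_armv8 configures CMake against the toolchain file baked
    # into the image and builds with Ninja, roughly:
    #   cmake -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} ... -G Ninja /work/mxnet && ninja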