diff --git a/cd/Jenkinsfile_cd_pipeline b/cd/Jenkinsfile_cd_pipeline
index e0e94770b682..afb7b9b6d27f 100644
--- a/cd/Jenkinsfile_cd_pipeline
+++ b/cd/Jenkinsfile_cd_pipeline
@@ -61,9 +61,17 @@ pipeline {
               stage("Build") {
                 cd_utils.trigger_release_job("Build static libmxnet", "mxnet_lib/static", params.MXNET_VARIANTS)    
               }
-              stage("PyPI Release") {
-                echo "Building PyPI Release"
-                cd_utils.trigger_release_job("Release PyPI Packages", "python/pypi", params.MXNET_VARIANTS)
+              stage("Releases") {
+                cd_utils.error_checked_parallel([
+                  "PyPI Release": {
+                    echo "Building PyPI Release"
+                    cd_utils.trigger_release_job("Release PyPI Packages", "python/pypi", params.MXNET_VARIANTS)
+                  },
+                  "Python Docker Release": {
+                    echo "Building Python Docker Release"
+                    cd_utils.trigger_release_job("Release Python Docker Images", "python/docker", params.MXNET_VARIANTS)
+                  }
+                ])
               }
             },
 
diff --git a/cd/Jenkinsfile_release_job b/cd/Jenkinsfile_release_job
index c2be26124029..4d6f3b5e9012 100644
--- a/cd/Jenkinsfile_release_job
+++ b/cd/Jenkinsfile_release_job
@@ -92,7 +92,8 @@ pipeline {
           def valid_job_types = [
             "mxnet_lib/static",
             "mxnet_lib/dynamic", 
-            "python/pypi"
+            "python/pypi",
+            "python/docker"
           ]
           
           // Convert mxnet variants to a list
diff --git a/cd/Jenkinsfile_utils.groovy b/cd/Jenkinsfile_utils.groovy
index 5182b04a3b5b..966f0a218057 100644
--- a/cd/Jenkinsfile_utils.groovy
+++ b/cd/Jenkinsfile_utils.groovy
@@ -160,6 +160,18 @@ def restore_artifact(variant, libtype) {
   }
 }
 
+
+// Restores the statically linked libmxnet for the given variant
+def restore_static_libmxnet(variant) {
+  restore_artifact(variant, 'static')
+}
+
+
+// Restores the dynamically linked libmxnet for the given variant
+def restore_dynamic_libmxnet(variant) {
+  restore_artifact(variant, 'dynamic')
+}
+
 // A generic pipeline that can be used by *most* CD jobs
 // It can be used when implementing the pipeline steps in the Jenkins_steps.groovy
 // script for a particular delivery channel. However, it should also implement the
diff --git a/cd/python/docker/Dockerfile b/cd/python/docker/Dockerfile
new file mode 100644
index 000000000000..dc70da188793
--- /dev/null
+++ b/cd/python/docker/Dockerfile
@@ -0,0 +1,40 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Python MXNet Dockerfile
+
+# NOTE: Assumes wheel_build directory is the context root when building
+
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ARG PYTHON_CMD=python
+RUN apt-get update && \
+    apt-get install -y wget ${PYTHON_CMD}-dev gcc && \
+    wget https://bootstrap.pypa.io/get-pip.py && \
+    ${PYTHON_CMD} get-pip.py
+
+ARG MXNET_COMMIT_ID
+ENV MXNET_COMMIT_ID=${MXNET_COMMIT_ID}
+
+RUN mkdir -p /mxnet
+COPY dist/*.whl /mxnet/.
+
+WORKDIR /mxnet
+RUN WHEEL_FILE=$(ls -t /mxnet | head -n 1) && pip install ${WHEEL_FILE} && rm -f ${WHEEL_FILE}
+
diff --git a/cd/python/docker/Dockerfile.test b/cd/python/docker/Dockerfile.test
new file mode 100644
index 000000000000..bed059d0fc73
--- /dev/null
+++ b/cd/python/docker/Dockerfile.test
@@ -0,0 +1,39 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Python MXNet test image Dockerfile (adds test dependencies on top of the Python MXNet image)
+
+# NOTE: Assumes 'ci' directory is root of the context when building
+
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# Install test dependencies
+RUN pip install nose
+
+ARG USER_ID=1001
+ARG GROUP_ID=1001
+
+COPY ./docker/install/ubuntu_adduser.sh /work/ubuntu_adduser.sh
+COPY ./docker/install/requirements /work/requirements
+
+RUN mkdir -p /work
+RUN /work/ubuntu_adduser.sh
+RUN pip install -r /work/requirements
+
+WORKDIR /work/mxnet
diff --git a/cd/python/docker/Jenkins_pipeline.groovy b/cd/python/docker/Jenkins_pipeline.groovy
new file mode 100644
index 000000000000..0d4925e00576
--- /dev/null
+++ b/cd/python/docker/Jenkins_pipeline.groovy
@@ -0,0 +1,74 @@
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// NOTE: 
+// ci_utils and cd_utils are loaded by the originating Jenkins job, e.g. cd/Jenkinsfile_release_job
+
+def get_pipeline(mxnet_variant) {
+  def node_type = mxnet_variant.startsWith('cu') ? NODE_LINUX_GPU : NODE_LINUX_CPU
+  return cd_utils.generic_pipeline(mxnet_variant, this, node_type)
+}
+
+// Returns the (Docker) environment for the given variant
+// The environment corresponds to the docker files in the 'docker' directory
+def get_environment(mxnet_variant) {
+  if (mxnet_variant.startsWith("cu")) {
+    // Remove 'mkl' suffix from variant to properly format test environment
+    return "ubuntu_gpu_${mxnet_variant.replace('mkl', '')}"
+  }
+  return "ubuntu_cpu"
+}
+
+
+def build(mxnet_variant) {
+  ws("workspace/python_docker/${mxnet_variant}/${env.BUILD_NUMBER}") {
+    ci_utils.init_git()
+    cd_utils.restore_static_libmxnet(mxnet_variant)
+
+    // package wheel file
+    def nvidia_docker = mxnet_variant.startsWith('cu')
+    def environment = get_environment(mxnet_variant)
+    ci_utils.docker_run(environment, "cd_package_pypi ${mxnet_variant}", nvidia_docker)
+
+    // build python docker images
+    sh "./cd/python/docker/python_images.sh build ${mxnet_variant} py3"
+    sh "./cd/python/docker/python_images.sh build ${mxnet_variant} py2"
+  }
+}
+
+def test(mxnet_variant) {
+  ws("workspace/python_docker/${mxnet_variant}/${env.BUILD_NUMBER}") {
+    // test python docker images
+    sh "./cd/python/docker/python_images.sh test ${mxnet_variant} py3"
+    sh "./cd/python/docker/python_images.sh test ${mxnet_variant} py2"
+  }
+}
+
+def push(mxnet_variant) {
+  ws("workspace/python_docker/${mxnet_variant}/${env.BUILD_NUMBER}") {
+    // push python docker images
+    sh "./cd/python/docker/python_images.sh push ${mxnet_variant} py3"
+    sh "./cd/python/docker/python_images.sh push ${mxnet_variant} py2"
+  }
+}
+
+return this
diff --git a/cd/python/docker/python_images.sh b/cd/python/docker/python_images.sh
new file mode 100755
index 000000000000..305676d2c40a
--- /dev/null
+++ b/cd/python/docker/python_images.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Executes mxnet python images pipeline functions: build, test, push
+# Assumes script is run from the root of the mxnet repository
+# Assumes script is being run within MXNet CD infrastructure
+
+set -xe
+
+usage="Usage: python_images.sh <build|test|push> MXNET-VARIANT <py2|py3>"
+
+command=${1:?$usage}           # pipeline step to run: build, test or push
+mxnet_variant=${2:?$usage}     # e.g. cpu, mkl, cu90, cu90mkl
+python_version=${3:?$usage}    # py2 or py3; a missing argument aborts with the usage text
+
+cd_utils='cd/utils'
+ci_utils='ci/'
+
+case ${python_version} in
+    py3)
+        python_cmd="python3"
+        ;;
+    py2)
+        python_cmd="python"
+        ;;
+    *)
+        echo "Error: specify python version with either 'py2' or 'py3'"
+        exit 1
+        ;;
+esac
+
+docker_tags=($(./${cd_utils}/docker_tag.sh ${mxnet_variant}))
+main_tag="${docker_tags[0]}_${python_version}"
+base_image=$(./${cd_utils}/mxnet_base_image.sh ${mxnet_variant})
+repository="python"
+image_name="${repository}:${main_tag}"
+
+resources_path='cd/python/docker'
+
+if [ ! -z "${RELEASE_DOCKERHUB_REPOSITORY}" ]; then
+    image_name="${RELEASE_DOCKERHUB_REPOSITORY}/${image_name}"
+fi
+
+build() {
+    # NOTE: Ensure the correct context root is passed in when building - Dockerfile expects ./wheel_build
+    docker build -t "${image_name}" --build-arg PYTHON_CMD=${python_cmd} --build-arg BASE_IMAGE="${base_image}" --build-arg MXNET_COMMIT_ID=${GIT_COMMIT} -f ${resources_path}/Dockerfile ./wheel_build
+}
+
+test() {
+    local runtime_param=""
+    if [[ ${mxnet_variant} == cu* ]]; then
+        runtime_param="--runtime=nvidia"
+    fi
+    local test_image_name="${image_name}_test"
+    
+    # Ensure the correct context root is passed in when building - Dockerfile.test expects ci directory
+    docker build -t "${test_image_name}" --build-arg USER_ID=`id -u` --build-arg GROUP_ID=`id -g` --build-arg BASE_IMAGE="${image_name}" -f ${resources_path}/Dockerfile.test ./ci
+    ./ci/safe_docker_run.py ${runtime_param} --cap-add "SYS_PTRACE" -u `id -u`:`id -g` -v `pwd`:/work/mxnet "${test_image_name}" ${resources_path}/test_python_image.sh "${mxnet_variant}" "${python_cmd}"
+}
+
+push() {
+    if [ -z "${RELEASE_DOCKERHUB_REPOSITORY}" ]; then
+        echo "Cannot publish image without RELEASE_DOCKERHUB_REPOSITORY environment variable being set." >&2
+        exit 1
+    fi
+
+    # The secret name env var is set in the Jenkins configuration
+    # Manage Jenkins -> Configure System
+    ./${ci_utils}/docker_login.py --secret-name "${RELEASE_DOCKERHUB_SECRET_NAME}"
+
+    # Push the image under its main (first) tag
+    docker push "${image_name}"
+
+    # Re-tag and push the image under each remaining tag, if any
+    for ((i=1;i<${#docker_tags[@]};i++)); do
+        local docker_tag="${docker_tags[${i}]}"
+        local latest_image_name="${RELEASE_DOCKERHUB_REPOSITORY}/${repository}:${docker_tag}"
+
+        # 'latest' and 'latest_gpu' carry no python suffix and are only pushed for py3
+        if [[ ${docker_tag} == "latest" || ${docker_tag} == "latest_gpu" ]]; then
+            if [[ ${python_version} == "py2" ]]; then
+                continue
+            fi
+        else
+            latest_image_name="${latest_image_name}_${python_version}"
+        fi
+
+        docker tag "${image_name}" "${latest_image_name}"
+        docker push "${latest_image_name}"
+        echo "Successfully pushed ${latest_image_name}. Pull it with:"
+        echo "docker pull ${latest_image_name}"
+        echo "For a complete list of tags see https://hub.docker.com/r/${RELEASE_DOCKERHUB_REPOSITORY}/${repository}/tags"
+    done
+}
+
+case ${command} in
+    "build")
+        build
+        ;;
+
+    "test")
+        test
+        ;;
+
+    "push")
+        push
+        ;;
+
+    *)
+        echo $usage
+        exit 1
+esac
diff --git a/cd/python/docker/test_python_image.sh b/cd/python/docker/test_python_image.sh
new file mode 100755
index 000000000000..88e03ea84a12
--- /dev/null
+++ b/cd/python/docker/test_python_image.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# To be run _within_ a runtime image
+# Tests the Runtime docker image
+# Assumes the mxnet source directory is mounted on /work/mxnet and cwd is /work/mxnet
+
+set -ex
+
+# Variant parameter should be passed in
+mxnet_variant=${1:?"Missing mxnet variant"}
+python_cmd=${2:?"Missing python version (python or python3)"}
+
+if [ -z "${MXNET_COMMIT_ID}" ]; then
+    echo "MXNET_COMMIT_ID environment variable is empty. Please rebuild the image with MXNET_COMMIT_ID build-arg specified."
+    exit 1
+fi
+
+# Execute tests
+if [[ $mxnet_variant == cu* ]]; then
+    mnist_params="--gpu 0"
+    test_conv_params="--gpu"
+fi
+
+if [[ $mxnet_variant == *mkl ]]; then
+    ${python_cmd} tests/python/mkl/test_mkldnn.py
+fi
+
+${python_cmd} tests/python/train/test_conv.py ${test_conv_params}
+${python_cmd} example/image-classification/train_mnist.py ${mnist_params}
+
diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy
index bf8103270146..e9f172a570fe 100644
--- a/cd/python/pypi/Jenkins_pipeline.groovy
+++ b/cd/python/pypi/Jenkins_pipeline.groovy
@@ -45,7 +45,7 @@ def get_environment(mxnet_variant) {
 def build(mxnet_variant) {
   ws("workspace/python_pypi/${mxnet_variant}/${env.BUILD_NUMBER}") {
     ci_utils.init_git()
-    cd_utils.restore_artifact(mxnet_variant, 'static')
+    cd_utils.restore_static_libmxnet(mxnet_variant)
     
     // create wheel file
     def environment = get_environment(mxnet_variant)
diff --git a/cd/utils/docker_tag.sh b/cd/utils/docker_tag.sh
new file mode 100755
index 000000000000..d16da4930774
--- /dev/null
+++ b/cd/utils/docker_tag.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+mxnet_variant=${1:?"Please specify the mxnet variant as the first parameter"}
+is_release=${RELEASE_BUILD:-false}
+version=${VERSION:-nightly}
+
+# The docker tags will be in the form <version>_<hardware>(_mkl)
+# Eg. nightly_cpu, 1.4.0_cpu_mkl, nightly_gpu_cu80_mkl, etc.
+
+if [[ ${mxnet_variant} == "cpu" ]]; then
+    tag_suffix="cpu"
+elif [[ ${mxnet_variant} == "mkl" ]]; then
+    tag_suffix="cpu_mkl"
+elif [[ ${mxnet_variant} == cu* ]]; then
+    tag_suffix="gpu_${mxnet_variant}"
+
+    # *mkl => *_mkl (e.g. gpu_cu90mkl => gpu_cu90_mkl)
+    if [[ $tag_suffix == *mkl ]]; then
+        tag_suffix="${tag_suffix%mkl}_mkl"
+    fi
+else
+    echo "Error: Unrecognized mxnet variant: '${mxnet_variant}'." >&2  # stdout is reserved for tags
+    exit 1
+fi
+
+echo "${version}_${tag_suffix}"
+
+# Print out latest tags as well
+if [[ ${is_release} == "true" ]]; then
+    if [[ ${mxnet_variant} == "cpu" ]]; then
+        echo "latest"
+        echo "latest_cpu"
+    elif [[ ${mxnet_variant} == "mkl" ]]; then
+        echo "latest_cpu_mkl"
+    elif [[ ${mxnet_variant} == "cu90" ]]; then
+        echo "latest_gpu"
+    elif [[ ${mxnet_variant} == "cu90mkl" ]]; then
+        echo "latest_gpu_mkl"
+    fi
+fi
diff --git a/cd/utils/mxnet_base_image.sh b/cd/utils/mxnet_base_image.sh
new file mode 100755
index 000000000000..dcfe7216dcb4
--- /dev/null
+++ b/cd/utils/mxnet_base_image.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+mxnet_variant=${1:?"Please specify the mxnet variant as the first parameter"}
+
+case ${mxnet_variant} in
+    cu80*)
+    echo "nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04"
+    ;;
+    cu90*)
+    echo "nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04"
+    ;;
+    cu92*)
+    echo "nvidia/cuda:9.2-cudnn7-runtime-ubuntu16.04"
+    ;;
+    cu100*)
+    echo "nvidia/cuda:10.0-cudnn7-runtime-ubuntu16.04"
+    ;;
+    cu101*)
+    echo "nvidia/cuda:10.1-cudnn7-runtime-ubuntu16.04"
+    ;;
+    cpu)
+    echo "ubuntu:16.04"
+    ;;
+    mkl)
+    echo "ubuntu:16.04"
+    ;;
+    *)
+    echo "Error: Unrecognized mxnet-variant: '${mxnet_variant}'" >&2  # stdout is reserved for the image name
+    exit 1
+    ;;
+esac
diff --git a/ci/build.py b/ci/build.py
index e6a183fefa34..8798c7ed2ef2 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -27,63 +27,17 @@
 
 import argparse
 import glob
-import logging
-import os
+import pprint
 import re
 import shutil
+import signal
 import subprocess
-import sys
-import tempfile
 from itertools import chain
 from subprocess import check_call, check_output
 from typing import *
-from util import *
-import docker
-import docker.models
-import docker.errors
-import signal
-import atexit
-import pprint
-
-
-class Cleanup:
-    """A class to cleanup containers"""
-    def __init__(self):
-        self.containers = set()
-        self.docker_stop_timeout = 3
 
-    def add_container(self, container: docker.models.containers.Container):
-        assert isinstance(container, docker.models.containers.Container)
-        self.containers.add(container)
-
-    def remove_container(self, container: docker.models.containers.Container):
-        assert isinstance(container, docker.models.containers.Container)
-        self.containers.remove(container)
-
-    def _cleanup_containers(self):
-        if self.containers:
-            logging.warning("Cleaning up containers")
-        else:
-            return
-        # noinspection PyBroadException
-        try:
-            stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self.docker_stop_timeout))
-        except Exception:
-            stop_timeout = 3
-        for container in self.containers:
-            try:
-                container.stop(timeout=stop_timeout)
-                logging.info("☠: stopped container %s", trim_container_id(container.id))
-                container.remove()
-                logging.info("🚽: removed container %s", trim_container_id(container.id))
-            except Exception as e:
-                logging.exception(e)
-        self.containers.clear()
-        logging.info("Cleaning up containers finished.")
-
-    def __call__(self):
-        """Perform cleanup"""
-        self._cleanup_containers()
+from safe_docker_run import SafeDockerClient
+from util import *
 
 
 def get_dockerfiles_path():
@@ -205,18 +159,13 @@ def default_ccache_dir() -> str:
     return os.path.join(os.path.expanduser("~"), ".ccache")
 
 
-def trim_container_id(cid):
-    """:return: trimmed container id"""
-    return cid[:12]
-
-
-def container_run(platform: str,
+def container_run(docker_client: SafeDockerClient,
+                  platform: str,
                   nvidia_runtime: bool,
                   docker_registry: str,
                   shared_memory_size: str,
                   local_ccache_dir: str,
                   command: List[str],
-                  cleanup: Cleanup,
                   environment: Dict[str, str],
                   dry_run: bool = False) -> int:
     """Run command in a container"""
@@ -232,13 +181,6 @@ def container_run(platform: str,
         'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache
                                               # verification.
     })
-    # These variables are passed to the container to the process tree killer can find runaway
-    # process inside the container
-    # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
-    # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393
-    #
-    jenkins_env_vars = ['BUILD_NUMBER', 'BUILD_ID', 'BUILD_TAG']
-    environment.update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ})
     environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})
 
     tag = get_docker_tag(platform=platform, registry=docker_registry)
@@ -248,7 +190,7 @@ def container_run(platform: str,
     os.makedirs(local_build_folder, exist_ok=True)
     os.makedirs(local_ccache_dir, exist_ok=True)
     logging.info("Using ccache directory: %s", local_ccache_dir)
-    docker_client = docker.from_env()
+
     # Equivalent command
     docker_cmd_list = [
         get_docker_binary(nvidia_runtime),
@@ -276,8 +218,7 @@ def container_run(platform: str,
     docker_cmd = ' \\\n\t'.join(docker_cmd_list)
     logging.info("Running %s in container %s", command, tag)
     logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
-    # return code of the command inside docker
-    ret = 0
+
     if not dry_run:
         #############################
         #
@@ -288,10 +229,10 @@ def container_run(platform: str,
             # noinspection PyShadowingNames
             # runc is default (docker info | grep -i runtime)
             runtime = 'nvidia'
-        container = docker_client.containers.run(
+
+        return docker_client.run(
             tag,
             runtime=runtime,
-            detach=True,
             command=command,
             shm_size=shared_memory_size,
             user='{}:{}'.format(os.getuid(), os.getgid()),
@@ -305,61 +246,7 @@ def container_run(platform: str,
                     {'bind': '/work/ccache', 'mode': 'rw'},
             },
             environment=environment)
-        try:
-            logging.info("Started container: %s", trim_container_id(container.id))
-            # Race condition:
-            # If the previous call is interrupted then it's possible that the container is not cleaned up
-            # We avoid by masking the signals temporarily
-            cleanup.add_container(container)
-            signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
-            #
-            #############################
-
-            stream = container.logs(stream=True, stdout=True, stderr=True)
-            sys.stdout.flush()
-            for chunk in stream:
-                sys.stdout.buffer.write(chunk)
-                sys.stdout.buffer.flush()
-            sys.stdout.flush()
-            stream.close()
-            try:
-                logging.info("Waiting for status of container %s for %d s.",
-                            trim_container_id(container.id),
-                            container_wait_s)
-                wait_result = container.wait(timeout=container_wait_s)
-                logging.info("Container exit status: %s", wait_result)
-                ret = wait_result.get('StatusCode', 200)
-                if ret != 0:
-                    logging.error("Container exited with an error 😞")
-                    logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv))
-                else:
-                    logging.info("Container exited with success 👍")
-            except Exception as e:
-                logging.exception(e)
-                ret = 150
-
-            # Stop
-            try:
-                logging.info("Stopping container: %s", trim_container_id(container.id))
-                container.stop()
-            except Exception as e:
-                logging.exception(e)
-                ret = 151
-
-            # Remove
-            try:
-                logging.info("Removing container: %s", trim_container_id(container.id))
-                container.remove()
-            except Exception as e:
-                logging.exception(e)
-                ret = 152
-            cleanup.remove_container(container)
-            containers = docker_client.containers.list()
-            if containers:
-                logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
-        except docker.errors.NotFound as e:
-            logging.info("Container was stopped before cleanup started: %s", e)
-    return ret
+    return 0
 
 
 def list_platforms() -> str:
@@ -388,17 +275,6 @@ def log_environment():
     logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))
 
 
-def script_name() -> str:
-    """:returns: script name with leading paths removed"""
-    return os.path.split(sys.argv[0])[1]
-
-def config_logging():
-    import time
-    logging.getLogger().setLevel(logging.INFO)
-    logging.getLogger("requests").setLevel(logging.WARNING)
-    logging.basicConfig(format='{}: %(asctime)sZ %(levelname)s %(message)s'.format(script_name()))
-    logging.Formatter.converter = time.gmtime
-
 def main() -> int:
     config_logging()
 
@@ -471,20 +347,7 @@ def main() -> int:
 
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
-
-    # Cleanup on signals and exit
-    cleanup = Cleanup()
-
-    def signal_handler(signum, _):
-        signal.pthread_sigmask(signal.SIG_BLOCK, {signum})
-        logging.warning("Signal %d received, cleaning up...", signum)
-        cleanup()
-        logging.warning("done. Exiting with error.")
-        sys.exit(1)
-
-    atexit.register(cleanup)
-    signal.signal(signal.SIGTERM, signal_handler)
-    signal.signal(signal.SIGINT, signal_handler)
+    docker_client = SafeDockerClient()
 
     environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
                         for e in args.environment])
@@ -498,7 +361,7 @@ def signal_handler(signum, _):
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
         if not args.run_only:
             build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
-                     num_retries=args.docker_build_retries, no_cache=args.no_cache)
+                         num_retries=args.docker_build_retries, no_cache=args.no_cache)
         else:
             logging.info("Skipping docker build step.")
 
@@ -510,23 +373,23 @@ def signal_handler(signum, _):
         ret = 0
         if command:
             ret = container_run(
-                platform=platform, nvidia_runtime=args.nvidiadocker,
+                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment)
+                local_ccache_dir=args.ccache_dir, environment=environment)
         elif args.print_docker_run:
             command = []
             ret = container_run(
-                platform=platform, nvidia_runtime=args.nvidiadocker,
+                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, dry_run=True, cleanup=cleanup, environment=environment)
+                local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment)
         else:
             # With no commands, execute a build function for the target platform
             command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
             logging.info("No command specified, trying default build: %s", ' '.join(command))
             ret = container_run(
-                platform=platform, nvidia_runtime=args.nvidiadocker,
+                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment)
+                local_ccache_dir=args.ccache_dir, environment=environment)
 
         if ret != 0:
             logging.critical("Execution of %s failed with status: %d", command, ret)
@@ -553,9 +416,9 @@ def signal_handler(signum, _):
                 continue
             command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
             container_run(
-                platform=platform, nvidia_runtime=args.nvidiadocker,
+                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment)
+                local_ccache_dir=args.ccache_dir, environment=environment)
             shutil.move(buildir(), plat_buildir)
             logging.info("Built files left in: %s", plat_buildir)
 
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 3a2a1fb415ee..254d6237d6e2 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -24,21 +24,21 @@
 state as if the container would have been built locally already.
 """
 
-import os
-import logging
 import argparse
-import sys
+import logging
+import os
 import subprocess
-import json
+import sys
 from typing import *
+
 import build as build_util
+from docker_login import login_dockerhub, logout_dockerhub
 from util import retry
 
-DOCKERHUB_LOGIN_NUM_RETRIES = 5
-DOCKERHUB_RETRY_SECONDS = 5
 DOCKER_CACHE_NUM_RETRIES = 3
 DOCKER_CACHE_TIMEOUT_MINS = 45
 PARALLEL_BUILDS = 10
+DOCKER_CACHE_RETRY_SECONDS = 5
 
 
 def build_save_containers(platforms, registry, load_cache) -> int:
@@ -111,41 +111,8 @@ def _upload_image(registry, docker_tag, image_id) -> None:
     subprocess.check_call(push_cmd)
 
 
-@retry(target_exception=subprocess.CalledProcessError, tries=DOCKERHUB_LOGIN_NUM_RETRIES,
-       delay_s=DOCKERHUB_RETRY_SECONDS)
-def _login_dockerhub():
-    """
-    Login to the Docker Hub account
-    :return: None
-    """
-    dockerhub_credentials = _get_dockerhub_credentials()
-
-    logging.info('Logging in to DockerHub')
-    # We use password-stdin instead of --password to avoid leaking passwords in case of an error.
-    # This method will produce the following output:
-    # > WARNING! Your password will be stored unencrypted in /home/jenkins_slave/.docker/config.json.
-    # > Configure a credential helper to remove this warning. See
-    # > https://docs.docker.com/engine/reference/commandline/login/#credentials-store
-    # Since we consider the restricted slaves a secure environment, that's fine. Also, using this will require
-    # third party applications which would need a review first as well.
-    p = subprocess.run(['docker', 'login', '--username', dockerhub_credentials['username'], '--password-stdin'],
-                       stdout=subprocess.PIPE, input=str.encode(dockerhub_credentials['password']))
-    logging.info(p.stdout)
-    logging.info('Successfully logged in to DockerHub')
-
-
-def _logout_dockerhub():
-    """
-    Log out of DockerHub to delete local credentials
-    :return: None
-    """
-    logging.info('Logging out of DockerHub')
-    subprocess.call(['docker', 'logout'])
-    logging.info('Successfully logged out of DockerHub')
-
-
 @retry(target_exception=subprocess.TimeoutExpired, tries=DOCKER_CACHE_NUM_RETRIES,
-       delay_s=DOCKERHUB_RETRY_SECONDS)
+       delay_s=DOCKER_CACHE_RETRY_SECONDS)
 def load_docker_cache(registry, docker_tag) -> None:
     """
     Load the precompiled docker cache from the registry
@@ -187,37 +154,6 @@ def delete_local_docker_cache(docker_tag):
         logging.debug('Error during local cache deletion %s', error)
 
 
-def _get_dockerhub_credentials():  # pragma: no cover
-    import boto3
-    import botocore
-    secret_name = os.environ['DOCKERHUB_SECRET_NAME']
-    endpoint_url = os.environ['DOCKERHUB_SECRET_ENDPOINT_URL']
-    region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION']
-
-    session = boto3.Session()
-    client = session.client(
-        service_name='secretsmanager',
-        region_name=region_name,
-        endpoint_url=endpoint_url
-    )
-    try:
-        get_secret_value_response = client.get_secret_value(
-            SecretId=secret_name
-        )
-    except botocore.exceptions.ClientError as client_error:
-        if client_error.response['Error']['Code'] == 'ResourceNotFoundException':
-            logging.exception("The requested secret %s was not found", secret_name)
-        elif client_error.response['Error']['Code'] == 'InvalidRequestException':
-            logging.exception("The request was invalid due to:")
-        elif client_error.response['Error']['Code'] == 'InvalidParameterException':
-            logging.exception("The request had invalid params:")
-        raise
-    else:
-        secret = get_secret_value_response['SecretString']
-        secret_dict = json.loads(secret)
-        return secret_dict
-
-
 def main() -> int:
     """
     Utility to create and publish the Docker cache to Docker Hub
@@ -248,11 +184,16 @@ def script_name() -> str:
     args = parser.parse_args()
 
     platforms = build_util.get_platforms()
+
+    secret_name = os.environ['DOCKERHUB_SECRET_NAME']
+    endpoint_url = os.environ['DOCKERHUB_SECRET_ENDPOINT_URL']
+    region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION']
+
     try:
-        _login_dockerhub()
+        login_dockerhub(secret_name, endpoint_url, region_name)
         return build_save_containers(platforms=platforms, registry=args.docker_registry, load_cache=True)
     finally:
-        _logout_dockerhub()
+        logout_dockerhub()
 
 
 if __name__ == '__main__':
diff --git a/ci/docker_login.py b/ci/docker_login.py
new file mode 100755
index 000000000000..b3b4d46e17ce
--- /dev/null
+++ b/ci/docker_login.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import json
+import logging
+import os
+import subprocess
+import sys
+
+from util import retry, config_logging
+
+DOCKERHUB_LOGIN_NUM_RETRIES = 5
+DOCKERHUB_RETRY_SECONDS = 5
+
+
+def _get_dockerhub_credentials(secret_name: str, secret_endpoint_url: str, secret_endpoint_region_name: str):
+    """
+    Fetch the DockerHub credentials from the secrets manager
+    :param secret_name: Id of the secret to retrieve
+    :param secret_endpoint_url: Endpoint URL passed to the secretsmanager client
+    :param secret_endpoint_region_name: Region name passed to the secretsmanager client
+    :return: The secret's JSON payload, parsed into a dict
+    """
+    import boto3
+    import botocore
+
+    secrets_client = boto3.Session().client(
+        service_name='secretsmanager', region_name=secret_endpoint_region_name, endpoint_url=secret_endpoint_url)
+    try:
+        response = secrets_client.get_secret_value(SecretId=secret_name)
+    except botocore.exceptions.ClientError as client_error:
+        error_code = client_error.response['Error']['Code']
+        if error_code == 'ResourceNotFoundException':
+            logging.exception("The requested secret %s was not found", secret_name)
+        elif error_code == 'InvalidRequestException':
+            logging.exception("The request was invalid due to:")
+        elif error_code == 'InvalidParameterException':
+            logging.exception("The request had invalid params:")
+        raise
+
+    return json.loads(response['SecretString'])
+
+
+@retry(target_exception=subprocess.CalledProcessError, tries=DOCKERHUB_LOGIN_NUM_RETRIES,
+       delay_s=DOCKERHUB_RETRY_SECONDS)
+def login_dockerhub(secret_name: str, secret_endpoint_url: str, secret_endpoint_region_name: str):
+    """
+    Fetch the DockerHub credentials and run `docker login` with them
+    :return: None
+    :raises RuntimeError: if `docker login` exits with a non-zero status
+    """
+    credentials = _get_dockerhub_credentials(secret_name, secret_endpoint_url, secret_endpoint_region_name)
+
+    logging.info('Logging in to DockerHub')
+    # The password is piped through stdin (--password-stdin) rather than passed via --password so it
+    # cannot leak through the command line in case of an error. Docker will print:
+    # > WARNING! Your password will be stored unencrypted in /home/jenkins_slave/.docker/config.json.
+    # > Configure a credential helper to remove this warning. See
+    # > https://docs.docker.com/engine/reference/commandline/login/#credentials-store
+    # That is acceptable because the restricted slaves are considered a secure environment, and a
+    # credential helper would be a third party application that needs a review first.
+    login_cmd = ['docker', 'login', '--username', credentials['username'], '--password-stdin']
+    result = subprocess.run(login_cmd, stdout=subprocess.PIPE, input=str.encode(credentials['password']))
+    logging.info(result.stdout)
+    if result.returncode != 0:
+        raise RuntimeError("Failed to login to DockerHub")
+
+    logging.info('Successfully logged in to DockerHub')
+
+
+def logout_dockerhub():
+    """Log out of DockerHub to delete the local credentials. A non-zero exit code is only logged."""
+    logging.info('Logging out of DockerHub')
+    return_code = subprocess.call(['docker', 'logout'])
+    if return_code == 0:
+        logging.info('Successfully logged out of DockerHub')
+    else:
+        logging.warning('Logging out of DockerHub failed with exit code %s', return_code)
+
+
+def main(command_line_arguments):
+    """Command line entry point: log in to DockerHub, exiting the process with status 1 on failure"""
+    config_logging()
+
+    parser = argparse.ArgumentParser(
+        description="Safe docker login utility to avoid leaking passwords", epilog="")
+    parser.add_argument("--secret-name",
+                        help="Secret name",
+                        type=str,
+                        required=True)
+
+    parser.add_argument("--secret-endpoint-url",
+                        help="Endpoint Url",
+                        type=str,
+                        default=os.environ.get("DOCKERHUB_SECRET_ENDPOINT_URL", None))
+
+    parser.add_argument("--secret-endpoint-region",
+                        help="AWS Region",
+                        type=str,
+                        default=os.environ.get("DOCKERHUB_SECRET_ENDPOINT_REGION", None))
+
+    args = parser.parse_args(args=command_line_arguments)
+
+    if args.secret_endpoint_url is None:
+        raise RuntimeError("Could not determine secret-endpoint-url, please specify with --secret-endpoint-url")
+
+    if args.secret_endpoint_region is None:
+        raise RuntimeError("Could not determine secret-endpoint-region, please specify with --secret-endpoint-region")
+
+    try:
+        login_dockerhub(args.secret_name, args.secret_endpoint_url, args.secret_endpoint_region)
+    except Exception as err:
+        logging.exception(err)
+        # sys.exit instead of exit(): the latter is injected by the site module and is not always available
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/ci/logging.conf b/ci/logging.conf
new file mode 100644
index 000000000000..d80365e27bf1
--- /dev/null
+++ b/ci/logging.conf
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[loggers]
+keys=root
+
+[handlers]
+keys=consoleHandler
+
+[formatters]
+keys=simpleFormatter
+
+[logger_root]
+level=DEBUG
+handlers=consoleHandler
+
+[handler_consoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=simpleFormatter
+args=(sys.stdout,)
+
+[formatter_simpleFormatter]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
+datefmt=
\ No newline at end of file
diff --git a/ci/safe_docker_run.py b/ci/safe_docker_run.py
new file mode 100755
index 000000000000..e3b55bccdff8
--- /dev/null
+++ b/ci/safe_docker_run.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Docker command wrapper to guard against Zombie containers
+"""
+
+import argparse
+import atexit
+import logging
+import os
+import signal
+import sys
+from functools import reduce
+from itertools import chain
+from typing import Dict, Any
+
+import docker
+from docker.errors import NotFound
+from docker.models.containers import Container
+
+from util import config_logging
+
+DOCKER_STOP_TIMEOUT_SECONDS = 3
+CONTAINER_WAIT_SECONDS = 600
+
+
+class SafeDockerClient:
+    """
+    A wrapper around the docker client to ensure that no zombie containers are left hanging around
+    in case the script is not allowed to finish normally
+    """
+
+    @staticmethod
+    def _trim_container_id(cid):
+        """:return: trimmed container id"""
+        return cid[:12]
+
+    def __init__(self):
+        self._docker_client = docker.from_env()
+        self._containers = set()
+        self._docker_stop_timeout = DOCKER_STOP_TIMEOUT_SECONDS
+        self._container_wait_seconds = CONTAINER_WAIT_SECONDS
+
+        def signal_handler(signum, _):
+            signal.pthread_sigmask(signal.SIG_BLOCK, {signum})
+            logging.warning("Signal %d received, cleaning up...", signum)
+            self._clean_up()
+            logging.warning("done. Exiting with error.")
+            sys.exit(1)
+
+        atexit.register(self._clean_up)
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)
+
+    def _clean_up(self):
+        """Stops and removes every container that is still tracked; registered for atexit and signals"""
+        if not self._containers:
+            return
+        logging.warning("Cleaning up containers")
+        try:
+            stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self._docker_stop_timeout))
+        except (TypeError, ValueError):
+            # Unparseable DOCKER_STOP_TIMEOUT: fall back to the module default instead of a magic number
+            stop_timeout = DOCKER_STOP_TIMEOUT_SECONDS
+        for container in self._containers:
+            try:
+                container.stop(timeout=stop_timeout)
+                logging.info("☠: stopped container %s", self._trim_container_id(container.id))
+                container.remove()
+                logging.info("🚽: removed container %s", self._trim_container_id(container.id))
+            except Exception as e:
+                logging.exception(e)
+        self._containers.clear()
+        logging.info("Cleaning up containers finished.")
+
+    def _add_container(self, container: Container) -> Container:
+        self._containers.add(container)
+        return container
+
+    def _remove_container(self, container: Container):
+        self._containers.remove(container)
+
+    def run(self, *args, **kwargs) -> int:
+        """Runs the container detached, streams its logs to stdout and returns the resulting status code"""
+        if kwargs.get("detach") is False:
+            raise ValueError("Can only safe run with 'detach' set to True")
+        kwargs["detach"] = True
+
+        # These variables are passed to the container so the process tree killer can find runaway
+        # process inside the container
+        # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
+        # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393
+        kwargs.setdefault("environment", {})
+
+        jenkins_env_vars = ["BUILD_NUMBER", "BUILD_ID", "BUILD_TAG"]
+        kwargs["environment"].update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ})
+
+        ret = 0
+        try:
+            # Race condition: if the call to docker_client.containers.run is interrupted, it is
+            # possible that the container won't be cleaned up. We avoid this by temporarily masking
+            # the signals; they are unmasked in a finally so a failed start doesn't leave them blocked.
+            signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
+            try:
+                container = self._add_container(self._docker_client.containers.run(*args, **kwargs))
+            finally:
+                signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
+            logging.info("Started container: %s", self._trim_container_id(container.id))
+            stream = container.logs(stream=True, stdout=True, stderr=True)
+            sys.stdout.flush()
+            for chunk in stream:
+                sys.stdout.buffer.write(chunk)
+                sys.stdout.buffer.flush()
+            sys.stdout.flush()
+            stream.close()
+
+            try:
+                logging.info("Waiting for status of container %s for %d s.",
+                             self._trim_container_id(container.id), self._container_wait_seconds)
+                wait_result = container.wait(timeout=self._container_wait_seconds)
+                logging.info("Container exit status: %s", wait_result)
+                ret = wait_result.get('StatusCode', 200)
+                if ret != 0:
+                    logging.error("Container exited with an error 😞")
+                    logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv))
+                else:
+                    logging.info("Container exited with success 👍")
+            except Exception as err:
+                logging.exception(err)
+                return 150
+
+            try:
+                logging.info("Stopping container: %s", self._trim_container_id(container.id))
+                container.stop()
+            except Exception as e:
+                logging.exception(e)
+                ret = 151
+
+            try:
+                logging.info("Removing container: %s", self._trim_container_id(container.id))
+                container.remove()
+            except Exception as e:
+                logging.exception(e)
+                ret = 152
+            self._remove_container(container)
+            containers = self._docker_client.containers.list()
+            if containers:
+                logging.info("Other running containers: %s", [self._trim_container_id(x.id) for x in containers])
+        except NotFound as e:
+            logging.info("Container was stopped before cleanup started: %s", e)
+
+        return ret
+
+
+def _volume_mount(volume_dfn: str) -> Dict[str, Any]:
+    """
+    Parses a docker style volume definition, e.g. docker run --volume /local/path:/container/path:ro,
+    into the mapping used by the python docker library, e.g. {"/local/path": {"bind": "/container/path", "mode": "ro"}}
+    The argparser uses it as a type converter, so invalid input is reported as an ArgumentTypeError.
+    The mode defaults to 'rw' when it is omitted.
+    :param volume_dfn: The volume definition in the format <local path>:<container path>[:ro|rw]
+    :return: An object in the form {"<local path>" : {"bind": "<container path>", "mode": "rw|ro"}}
+    """
+    if volume_dfn is None:
+        raise argparse.ArgumentTypeError("Missing value for volume definition")
+
+    # <local path>:<container path>[:<mode>]
+    fields = volume_dfn.split(":")
+
+    if not 2 <= len(fields) <= 3:
+        raise argparse.ArgumentTypeError("Invalid volume definition {}".format(volume_dfn))
+
+    local_path, container_path = fields[0], fields[1]
+    # The optional third field is the access mode
+    mode = fields[2] if len(fields) == 3 else "rw"
+    if mode not in ("rw", "ro"):
+        raise argparse.ArgumentTypeError("Invalid volume mount mode {} in volume definition {}".format(mode, volume_dfn))
+
+    return {local_path: {"bind": container_path, "mode": mode}}
+
+
+def main(command_line_arguments):
+    """
+    Command line entry point: a `docker run` look-alike that goes through SafeDockerClient
+    :param command_line_arguments: Arguments to parse, without the program name
+    :return: The status code returned by SafeDockerClient.run
+    """
+    config_logging()
+
+    parser = argparse.ArgumentParser(
+        description="""Wrapper around docker run that protects against Zombie containers""", epilog="")
+
+    # Options mirroring their `docker run` counterparts
+    parser.add_argument("-u", "--user", help="Username or UID (format: <name|uid>[:<group|gid>])", default=None)
+    parser.add_argument("-v", "--volume", action='append', type=_volume_mount, help="Bind mount a volume",
+                        default=[])
+    parser.add_argument("--cap-add", help="Add Linux capabilities", action="append", type=str, default=[])
+    parser.add_argument("--runtime", help="Runtime to use for this container", default=None)
+    parser.add_argument("--name", help="Assign a name to the container", default=None)
+
+    # Positionals: the image, the command to run in it and the command's arguments
+    parser.add_argument("image", metavar="IMAGE")
+    parser.add_argument("command", metavar="COMMAND")
+    parser.add_argument("args", nargs='*', metavar="ARG")
+
+    parsed = parser.parse_args(args=command_line_arguments)
+    docker_client = SafeDockerClient()
+
+    def merge_mounts(merged, mount):
+        """Folds one parsed --volume mapping into the accumulated volumes dict"""
+        return {**merged, **mount}
+
+    # The command and its arguments are handed to docker as one space separated string
+    full_command = " ".join(chain([parsed.command], parsed.args))
+    volumes = reduce(merge_mounts, parsed.volume, {})
+
+    return docker_client.run(
+        parsed.image,
+        command=full_command,
+        user=parsed.user,
+        runtime=parsed.runtime,
+        name=parsed.name,
+        volumes=volumes,
+        cap_add=parsed.cap_add
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))  # sys.exit, not the site-injected exit(), so it also works without site
diff --git a/ci/test_docker_cache.py b/ci/test_docker_cache.py
index 0a3bc4640c05..aeb399ff6b45 100644
--- a/ci/test_docker_cache.py
+++ b/ci/test_docker_cache.py
@@ -88,7 +88,7 @@ def setUp(self):
         base = os.path.split(os.path.realpath(__file__))[0]
         os.chdir(base)
 
-        docker_cache._login_dockerhub = MagicMock()  # Override login
+        docker_cache.login_dockerhub = MagicMock()  # Override login
 
         # Stop in case previous execution was dirty
         try:
@@ -135,7 +135,7 @@ def test_full_cache(self):
                 """
         platform = 'test_full_cache'
         docker_tag = build_util.get_docker_tag(platform=platform, registry=DOCKER_REGISTRY_PATH)
-        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.' + platform)
+        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.build.' + platform)
         try:
             with open(dockerfile_path, 'w') as dockerfile_handle:
                 dockerfile_handle.write(dockerfile_content)
@@ -144,13 +144,25 @@ def test_full_cache(self):
             docker_cache.delete_local_docker_cache(docker_tag=docker_tag)
 
             def warm_up_lambda_func():
-                build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH)
+                build_util.build_docker(
+                    docker_binary='docker',
+                    platform=platform,
+                    registry=DOCKER_REGISTRY_PATH,
+                    num_retries=3,
+                    no_cache=False
+                )
             _assert_docker_build(lambda_func=warm_up_lambda_func, expected_cache_hit_count=0,
                                  expected_cache_miss_count=4)
 
             # Assert local cache is properly primed
             def primed_cache_lambda_func():
-                build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH)
+                build_util.build_docker(
+                    docker_binary='docker',
+                    platform=platform,
+                    registry=DOCKER_REGISTRY_PATH,
+                    num_retries=3,
+                    no_cache=False
+                )
             _assert_docker_build(lambda_func=primed_cache_lambda_func, expected_cache_hit_count=4,
                                  expected_cache_miss_count=0)
 
@@ -169,8 +181,6 @@ def clean_cache_lambda_func():
             os.remove(dockerfile_path)
             docker_cache.delete_local_docker_cache(docker_tag=docker_tag)
 
-
-
     def test_partial_cache(self):
         """
         Test whether it's possible to restore cache and then pit it up partially by using a Dockerfile which shares
@@ -196,7 +206,7 @@ def test_partial_cache(self):
                 """
         platform = 'test_partial_cache'
         docker_tag = build_util.get_docker_tag(platform=platform, registry=DOCKER_REGISTRY_PATH)
-        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.' + platform)
+        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.build.' + platform)
         try:
             # Write initial Dockerfile
             with open(dockerfile_path, 'w') as dockerfile_handle:
@@ -206,13 +216,25 @@ def test_partial_cache(self):
             docker_cache.delete_local_docker_cache(docker_tag=docker_tag)
 
             def warm_up_lambda_func():
-                build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH)
+                build_util.build_docker(
+                    docker_binary='docker',
+                    platform=platform,
+                    registry=DOCKER_REGISTRY_PATH,
+                    num_retries=3,
+                    no_cache=False
+                )
             _assert_docker_build(lambda_func=warm_up_lambda_func, expected_cache_hit_count=0,
                                  expected_cache_miss_count=4)
 
             # Assert local cache is properly primed
             def primed_cache_lambda_func():
-                build_util.build_docker(docker_binary='docker', platform=platform, registry=DOCKER_REGISTRY_PATH)
+                build_util.build_docker(
+                    docker_binary='docker',
+                    platform=platform,
+                    registry=DOCKER_REGISTRY_PATH,
+                    num_retries=3,
+                    no_cache=False
+                )
             _assert_docker_build(lambda_func=primed_cache_lambda_func, expected_cache_hit_count=4,
                                  expected_cache_miss_count=0)
 
diff --git a/ci/test_docker_login.py b/ci/test_docker_login.py
new file mode 100644
index 000000000000..6c989ade92ff
--- /dev/null
+++ b/ci/test_docker_login.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Docker login tests
+"""
+import os
+import subprocess
+import unittest
+from unittest.mock import create_autospec, patch, call, MagicMock
+
+import boto3
+from boto3 import client
+from botocore.stub import Stubber
+
+from docker_login import login_dockerhub, logout_dockerhub, main, DOCKERHUB_RETRY_SECONDS, DOCKERHUB_LOGIN_NUM_RETRIES
+
+
+SECRET_NAME = "secret_name"
+SECRET_ENDPOINT_URL = "https://endpoint.url"
+SECRET_ENDPOINT_REGION = "us-east-2"
+
+
+def mock_boto(num_calls: int = 1):
+    """
+    Creates a mock boto3 session whose secretsmanager client is stubbed to answer get_secret_value
+    :return: Tuple of (mock session, stubber) with num_calls responses queued on the stubber
+    """
+    secrets_client = client("secretsmanager", region_name="us-east-1")
+    session = create_autospec(boto3.Session)
+    session.client.return_value = secrets_client
+
+    stubber = Stubber(secrets_client)
+    for _ in range(num_calls):
+        stubber.add_response(
+            method="get_secret_value",
+            expected_params={"SecretId": "secret_name"},  # Value of SECRET_NAME used by the tests
+            service_response={"SecretString": """{"username": "myuser", "password": "mypass"}"""})
+    return session, stubber
+
+
+class TestDockerLogin(unittest.TestCase):
+
+    @patch("subprocess.run", name="mock_subprocess_run")
+    def test_docker_login_success(self, mock_run):
+        """
+        Tests successful docker login raises no error and calls docker appropriately
+        """
+        mock_session, stub = mock_boto()
+        stub.activate()
+        with patch("boto3.Session", return_value=mock_session):
+            mock_process = MagicMock(spec=subprocess.CompletedProcess([], 0), name="mock_process")
+
+            # Simulate successful login
+            mock_process.returncode = 0
+            mock_run.return_value = mock_process
+
+            login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION)
+
+            # Check boto client is properly created
+            assert mock_session.client.call_args_list == [
+                call(service_name="secretsmanager", region_name="us-east-2", endpoint_url="https://endpoint.url")
+            ]
+
+            # Check that login call passes in the password in the correct way
+            assert mock_run.call_args_list == [
+                call(
+                    ["docker", "login", "--username", "myuser", "--password-stdin"],
+                    stdout=subprocess.PIPE,
+                    input=str.encode("mypass")
+                )
+            ]
+        stub.deactivate()
+
+    @patch("subprocess.run", name="mock_subprocess_run")
+    @patch("time.sleep")
+    def test_docker_login_retry(self, mock_sleep, mock_run):
+        """
+        Tests that the login is retried with exponential backoff after a CalledProcessError
+        and succeeds once docker login works
+        """
+        num_tries = 3
+        mock_session, stub = mock_boto(num_calls=num_tries)
+        stub.activate()
+        with patch("boto3.Session", return_value=mock_session):
+            mock_process = MagicMock(spec=subprocess.CompletedProcess([], 0), name="mock_process")
+
+            # Simulate successful login
+            mock_process.returncode = 0
+
+            # Simulate (num_tries - 1) errors + 1 success
+            mock_run.side_effect = \
+                [subprocess.CalledProcessError(1, "cmd", "some error")] * (num_tries - 1) + [mock_process]
+
+            login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION)
+
+            # Check boto client is properly created
+            assert mock_session.client.call_args_list == [
+                call(service_name="secretsmanager", region_name="us-east-2", endpoint_url="https://endpoint.url")
+            ] * num_tries
+
+            # Check that login call passes in the password in the correct way
+            cmd = ["docker", "login", "--username", "myuser", "--password-stdin"]
+            assert mock_run.call_args_list == [
+                call(cmd, stdout=subprocess.PIPE, input=str.encode("mypass"))
+            ] * num_tries
+
+            # Assert sleep was called appropriately
+            assert mock_sleep.call_args_list == [
+                call(2 ** retry_num * DOCKERHUB_RETRY_SECONDS) for retry_num in range(0, num_tries - 1)
+            ]
+        stub.deactivate()
+
+    @patch("subprocess.run", name="mock_subprocess_run")
+    @patch("time.sleep")
+    def test_docker_login_retry_exhausted(self, mock_sleep, mock_run):
+        """
+        Tests that the CalledProcessError is re-raised once all retries are exhausted
+        """
+        num_tries = DOCKERHUB_LOGIN_NUM_RETRIES
+        mock_session, stub = mock_boto(num_calls=num_tries)
+        stub.activate()
+        with patch("boto3.Session", return_value=mock_session):
+            # Simulate num_tries errors
+            mock_run.side_effect = [subprocess.CalledProcessError(1, "cmd", "some error")] * num_tries
+
+            with self.assertRaises(subprocess.CalledProcessError):
+                login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION)
+
+            # Check boto client is properly created
+            assert mock_session.client.call_args_list == [
+                call(service_name="secretsmanager", region_name="us-east-2", endpoint_url="https://endpoint.url")
+            ] * num_tries
+
+            # Check that login call passes in the password in the correct way
+            cmd = ["docker", "login", "--username", "myuser", "--password-stdin"]
+            assert mock_run.call_args_list == [
+                call(cmd, stdout=subprocess.PIPE, input=str.encode("mypass"))
+            ] * num_tries
+
+            # Assert sleep was called appropriately
+            assert mock_sleep.call_args_list == [
+                call(2 ** retry_num * DOCKERHUB_RETRY_SECONDS) for retry_num in range(0, num_tries-1)
+            ]
+        stub.deactivate()
+
+    @patch("subprocess.run", name="mock_subprocess_run")
+    def test_docker_login_failed(self, mock_run):
+        """
+        Tests failed docker login (non-zero exit status) raises RuntimeError
+        """
+        mock_session, stub = mock_boto()
+        stub.activate()
+        with patch("boto3.Session", return_value=mock_session):
+
+            mock_process = MagicMock(spec=subprocess.CompletedProcess([], 0), name="mock_process")
+
+            # Simulate failed login
+            mock_process.returncode = 1
+            mock_run.return_value = mock_process
+
+            with self.assertRaises(RuntimeError):
+                login_dockerhub(SECRET_NAME, SECRET_ENDPOINT_URL, SECRET_ENDPOINT_REGION)
+        stub.deactivate()
+
+    @patch("subprocess.call", name="mock_subprocess_call")
+    def test_logout(self, mock_call):
+        """
+        Tests logout calls docker command appropriately
+        """
+        logout_dockerhub()
+        assert mock_call.call_args_list == [
+            call(["docker", "logout"])
+        ]
+
+    @patch("docker_login.login_dockerhub")
+    def test_main_exit(self, mock_login):
+        """
+        Tests main exits with error on failed docker login
+        """
+        mock_login.side_effect = RuntimeError("Didn't work")
+        with self.assertRaises(SystemExit):
+            main(["--secret-name", "name", "--secret-endpoint-url", "url", "--secret-endpoint-region", "r"])
+
+    @patch("docker_login.login_dockerhub")
+    def test_main_default_argument_values(self, mock_login):
+        """
+        Tests default arguments
+        """
+
+        # Good env
+        env = {
+            "DOCKERHUB_SECRET_ENDPOINT_URL": "url",
+            "DOCKERHUB_SECRET_ENDPOINT_REGION": "region"
+        }
+        with patch.dict(os.environ, env):
+            main(["--secret-name", "name"])
+            assert mock_login.call_args_list == [
+                call("name", "url", "region")
+            ]
+
+        # Bad envs - none or not all required vars defined. The environment is cleared so that
+        # variables already exported on the host (e.g. on CI) cannot satisfy the defaults.
+        tests = [
+            {},
+            {"DOCKERHUB_SECRET_ENDPOINT_URL": "url"},
+            {"DOCKERHUB_SECRET_ENDPOINT_REGION": "region"}
+        ]
+        for bad_env in tests:
+            with patch.dict(os.environ, bad_env, clear=True):
+                with self.assertRaises(RuntimeError):
+                    main(["--secret-name", "name"])
+
+
+if __name__ == '__main__':
+    import nose
+    nose.main()
diff --git a/ci/test_safe_docker_run.py b/ci/test_safe_docker_run.py
new file mode 100644
index 000000000000..433d42e8b2ea
--- /dev/null
+++ b/ci/test_safe_docker_run.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Safe docker run tests
+"""
+import itertools
+import os
+import signal
+import unittest
+from typing import Optional
+from unittest.mock import create_autospec, patch, call
+
+from docker import DockerClient
+from docker.models.containers import Container, ContainerCollection
+
+from safe_docker_run import SafeDockerClient, main
+
+
+def create_mock_container(status_code: int = 0):
+    """
+    Builds an autospec'd docker Container mock whose wait() reports the given exit status code
+    """
+    container = create_autospec(Container, name="mock_container")
+    # wait() hands back a dict carrying the exit code under the "StatusCode" key
+    wait_result = {"StatusCode": status_code}
+    container.wait.return_value = wait_result
+    return container
+
+
+def create_mock_container_collection(container: Container):
+    """
+    Creates a mock ContainerCollection whose 'run' method returns the supplied container
+    """
+    collection = create_autospec(ContainerCollection, name="mock_collection")
+    collection.run.return_value = container
+    return collection
+
+
+class MockDockerClient:
+    """
+    Context manager that patches docker.from_env so it hands out a mock DockerClient
+    Calling client.containers.run on that mock client returns the supplied container
+    """
+    def __init__(self, container: Container):
+        client = create_autospec(DockerClient, name="mock_client")
+        client.containers = create_mock_container_collection(container)
+        self._mock_client, self._patch = client, patch("docker.from_env", return_value=client)
+
+    def __enter__(self):
+        self._patch.start()
+        return self._mock_client
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._patch.stop()
+
+
+class TestSafeDockerRun(unittest.TestCase):
+
+    @patch("safe_docker_run.signal.pthread_sigmask")
+    @patch.dict(os.environ, {
+        "BUILD_NUMBER": "BUILD_NUMBER_5",
+        "BUILD_ID": "BUILD_ID_1",
+        "BUILD_TAG": "BUILD_TAG_7"
+    })
+    def test_run_successful(self, mock_pthread_sigmask):
+        """
+        Tests a run that exits with 0: docker call, signal masking and container clean up
+        """
+        handled_signals = {signal.SIGINT, signal.SIGTERM}
+        container = create_mock_container()
+        with MockDockerClient(container) as mock_client:
+            safe_docker = SafeDockerClient()
+
+            # The exit status of the container is what run() returns
+            assert safe_docker.run("image", "command") == 0
+
+            # Exactly one detached docker run, with the patched BUILD_* variables forwarded
+            assert mock_client.containers.run.call_args_list == [
+                call("image", "command", detach=True, environment={
+                    "BUILD_NUMBER": "BUILD_NUMBER_5",
+                    "BUILD_ID": "BUILD_ID_1",
+                    "BUILD_TAG": "BUILD_TAG_7"
+                })
+            ]
+
+            # SIGINT/SIGTERM are blocked first and unblocked afterwards, in that order
+            assert mock_pthread_sigmask.call_args_list == [
+                call(signal.SIG_BLOCK, handled_signals),
+                call(signal.SIG_UNBLOCK, handled_signals)
+            ]
+
+            # The container is stopped and removed once and no longer tracked by the client
+            assert container.stop.call_count == 1
+            assert container.remove.call_count == 1
+            assert len(safe_docker._containers) == 0
+
+    def test_run_detach(self):
+        """
+        Tests detach=True is always passed to the underlying call and detach=False is rejected
+        """
+        mock_container = create_mock_container()
+
+        # Test detach=True is passed in even if not specified
+        with MockDockerClient(mock_container) as mock_client:
+            safe_docker = SafeDockerClient()
+            assert safe_docker.run("image", "command") == 0
+            assert mock_client.containers.run.call_count == 1
+            _, kwargs = mock_client.containers.run.call_args
+            assert kwargs["detach"] is True
+
+        # Test passing in detach=True does not cause any issues
+        with MockDockerClient(mock_container) as mock_client:
+            safe_docker = SafeDockerClient()
+            assert safe_docker.run("image", "command", detach=True) == 0
+            assert mock_client.containers.run.call_count == 1
+            _, kwargs = mock_client.containers.run.call_args
+            assert kwargs["detach"] is True
+
+        # Test detach=False fails; the last check sits outside assertRaises so that it actually runs
+        with MockDockerClient(mock_container) as mock_client:
+            safe_docker = SafeDockerClient()
+            with self.assertRaises(ValueError):
+                safe_docker.run("image", "command", detach=False)
+            assert mock_client.containers.run.call_args_list == []
+
+    def test_jenkins_vars(self):
+        """
+        Tests jenkins environment variables are appropriately passed to the underlying docker run call
+        """
+        # NOTE: It's important that these variables are passed to the underlying docker container
+        # These variables are passed to the container so the process tree killer can find runaway
+        # process inside the container
+        # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
+        # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393
+
+        jenkins_vars = {
+            "BUILD_NUMBER": "BUILD_NUMBER_5",
+            "BUILD_ID": "BUILD_ID_1",
+            "BUILD_TAG": "BUILD_TAG_7"
+        }
+        mock_container = create_mock_container()
+
+        # Test environment is empty if the jenkins vars are not present (os.environ is cleared for this)
+        with MockDockerClient(mock_container) as mock_client, patch.dict(os.environ, {}, clear=True):
+            safe_docker = SafeDockerClient()
+            assert safe_docker.run("image", "command") == 0
+            assert mock_client.containers.run.call_count == 1
+            _, kwargs = mock_client.containers.run.call_args
+            assert kwargs["environment"] == {}
+
+        # Test environment contains jenkins env vars if they are present
+        with MockDockerClient(mock_container) as mock_client:
+            with patch.dict(os.environ, jenkins_vars):
+                safe_docker = SafeDockerClient()
+                assert safe_docker.run("image", "command") == 0
+                assert mock_client.containers.run.call_count == 1
+                _, kwargs = mock_client.containers.run.call_args
+                assert kwargs["environment"] == jenkins_vars
+
+        # Test jenkins env vars are added to callers env vars
+        user_env = {"key1": "value1", "key2": "value2"}
+        with MockDockerClient(mock_container) as mock_client:
+            with patch.dict(os.environ, jenkins_vars):
+                safe_docker = SafeDockerClient()
+                assert safe_docker.run("image", "command", environment=user_env) == 0
+                assert mock_client.containers.run.call_count == 1
+                _, kwargs = mock_client.containers.run.call_args
+                assert kwargs["environment"] == {**jenkins_vars, **user_env}
+
+    def test_run_args_kwargs_passed(self):
+        """
+        Tests args and kwargs are passed to the container run call
+        """
+        mock_container = create_mock_container()
+
+        # os.environ is cleared so no jenkins vars leak into the expected (empty) environment
+        with MockDockerClient(mock_container) as mock_client, patch.dict(os.environ, {}, clear=True):
+            safe_docker = SafeDockerClient()
+            assert safe_docker.run(
+                "image",
+                "command",
+                "another_arg",
+                str_param="value",
+                bool_param=True,
+                none_param=None,
+                int_param=5,
+                float_param=5.2,
+                list_param=["this", "is", "a", "list"],
+                map_param={
+                    "a": "5",
+                    "b": True,
+                    "c": 2
+                }) == 0
+            assert mock_client.containers.run.call_args_list == [
+                call(
+                    "image",
+                    "command",
+                    "another_arg",
+                    detach=True,
+                    environment={},
+                    str_param="value",
+                    bool_param=True,
+                    none_param=None,
+                    int_param=5,
+                    float_param=5.2,
+                    list_param=["this", "is", "a", "list"],
+                    map_param={
+                        "a": "5",
+                        "b": True,
+                        "c": 2
+                    }
+                )
+            ]
+
+    def test_container_returns_non_zero_status_code(self):
+        """
+        Tests a non-zero container exit code is propagated by run and
+        the container is still stopped and removed
+        """
+        exit_code = 10
+        container = create_mock_container(status_code=exit_code)
+        with MockDockerClient(container):
+            safe_docker = SafeDockerClient()
+            assert safe_docker.run("image", "command") == exit_code
+            assert container.stop.call_count == 1
+            assert container.remove.call_count == 1
+            assert len(safe_docker._containers) == 0
+
+    def test_container_wait_raises_returns_150(self):
+        """
+        Tests run returns 150 when container.wait raises an error
+        """
+        failure = RuntimeError("Something bad happened")
+        container = create_mock_container()
+        container.wait.side_effect = failure
+        with MockDockerClient(container):
+            assert SafeDockerClient().run("image", "command") == 150
+
+    def test_container_stop_raises_returns_151(self):
+        """
+        Tests run returns 151 when container.stop raises an error
+        """
+        failure = RuntimeError("Something bad happened")
+        container = create_mock_container()
+        container.stop.side_effect = failure
+        with MockDockerClient(container):
+            assert SafeDockerClient().run("image", "command") == 151
+
+    def test_container_remove_raises_returns_152(self):
+        """
+        Tests run returns 152 when container.remove raises an error
+        """
+        failure = RuntimeError("Something bad happened")
+        container = create_mock_container()
+        container.remove.side_effect = failure
+        with MockDockerClient(container):
+            assert SafeDockerClient().run("image", "command") == 152
+
+    def test_main(self):
+        """
+        Tests main function against different command line arguments
+        """
+        tests = [
+            # ( supplied command line arguments, expected call )
+            (
+                ["image", "command"],
+                call("image", command="command", runtime=None, user=None, name=None, volumes={}, cap_add=[])
+            ),
+            (
+                ["image", "command", "arg1", "arg2"],
+                call("image", command="command arg1 arg2", runtime=None, user=None, name=None, volumes={}, cap_add=[])
+            ),
+            (
+                ["--runtime", "nvidia", "image", "command"],
+                call("image", command="command", runtime="nvidia", user=None, name=None, volumes={}, cap_add=[])
+            ),
+            (
+                ["--user", "1001:1001", "image", "command"],
+                call("image", command="command", runtime=None, user="1001:1001", name=None, volumes={}, cap_add=[])
+            ),
+            ([
+                "--volume", "/local/path1:/container/path1",
+                "--volume", "/local/path2:/container/path2:ro",
+                "image",
+                "command"
+            ], call("image", command="command", runtime=None, user=None, name=None, volumes={
+                "/local/path1": {
+                    "bind": "/container/path1",
+                    "mode": "rw"
+                },
+                "/local/path2": {
+                    "bind": "/container/path2",
+                    "mode": "ro"
+                }
+            }, cap_add=[])),
+            ([
+                "--runtime", "nvidia",
+                "-u", "1001:1001",
+                "-v", "/local/path1:/container/path1",
+                "-v", "/local/path2:/container/path2:ro",
+                "--cap-add", "bob",
+                "--cap-add", "jimmy",
+                "--name",
+                "container_name",
+                "image",
+                "command",
+                "arg1",
+                "arg2"
+            ], call(
+                "image",
+                command="command arg1 arg2",
+                runtime="nvidia",
+                user="1001:1001",
+                name="container_name",
+                volumes={
+                    "/local/path1": {
+                        "bind": "/container/path1",
+                        "mode": "rw"
+                    },
+                    "/local/path2": {
+                        "bind": "/container/path2",
+                        "mode": "ro"
+                    }
+                }, cap_add=["bob", "jimmy"])
+            )
+        ]
+
+        # Tests valid arguments
+        mock_docker = create_autospec(SafeDockerClient)
+        mock_docker.run.return_value = 0
+        with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker):
+            for test in tests:
+                arguments, expected_call = test
+                main(arguments)
+                assert mock_docker.run.call_args == expected_call
+
+        # Tests invalid arguments
+        tests = [
+            [],
+            None,  # NOTE(review): presumably parsed from the real sys.argv - confirm this always exits
+            ["image"],
+            # Test some bad volume mounts
+            ["-v", "bob", "image", "args"],
+            ["-v", "/local/path", "image", "args"],
+            ["-v", "/local/path:/container/path:blah", "image", "args"],
+            ["-v", "", "image", "args"],
+            ["-v", "a:b:c:d", "image", "args"]
+        ]
+        # Each case needs its own assertRaises: the context manager exits on the first SystemExit
+        mock_docker = create_autospec(SafeDockerClient)
+        with patch("safe_docker_run.SafeDockerClient", return_value=mock_docker):
+            for test in tests:
+                with self.assertRaises(SystemExit):
+                    main(test)
+
+    def test_clean_up(self):
+        """
+        Tests container clean up on normal exit and in case of SIGTERM and SIGINT
+        """
+        import subprocess
+        import time
+        import docker.errors
+
+        docker_client = docker.from_env()
+        container_name = "safedockertestcontainer1234"
+
+        def get_container(name: str) -> Optional[Container]:
+            try:
+                return docker_client.containers.get(name)
+            except docker.errors.NotFound:
+                return None
+
+        def remove_container_if_exists(name: str):
+            container = get_container(name)
+            if container:
+                container.stop()
+                container.remove()
+
+        def wait_for_container(name: str) -> bool:
+            for _ in range(5):  # bounded polling: itertools.count(5) would never terminate
+                if get_container(name):
+                    return True
+                time.sleep(1)
+            return False
+
+        # Clear any containers with container name
+        remove_container_if_exists(container_name)
+
+        # None => no signal is emitted - we should still finish with no containers at the end due
+        # to the atexit
+        for sig in [None, signal.SIGTERM, signal.SIGINT]:
+            # Execute the safe docker run script in a different process
+            proc = subprocess.Popen(['./safe_docker_run.py', "--name", container_name, "ubuntu:18.04", "sleep 10"])
+            # NOTE: we need to wait for the container to come up as not all operating systems support blocking signals
+            if wait_for_container(container_name) is False:
+                raise RuntimeError("Test container did not come up")
+
+            # Issue the signal and wait for the process to finish
+            if sig:
+                proc.send_signal(sig)
+            proc.wait()
+
+            # The container should no longer exist
+            assert get_container(container_name) is None
+
+
+if __name__ == '__main__':
+    import nose  # imported lazily: only needed when this file is executed as a script
+    nose.main()
diff --git a/ci/util.py b/ci/util.py
index 9a8d52eb1716..4b3a399184f9 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -18,6 +18,9 @@
 import os
 import contextlib
 import logging
+import logging.config
+import sys
+
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -32,6 +35,7 @@ def is_mxnet_root(path: str) -> bool:
         curpath = parent
     return curpath
 
+
 @contextlib.contextmanager
 def remember_cwd():
     '''
@@ -113,3 +117,16 @@ def chdir_to_script_directory():
     os.chdir(base)
 
 
+def script_name() -> str:
+    """:returns: file name of the running script (``sys.argv[0]`` without its directory part)"""
+    return os.path.basename(sys.argv[0])
+
+
+def config_logging():
+    """Configure logging from the file named by LOGGING_CONF, defaulting to logging.conf next to this module"""
+    default_conf = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logging.conf")
+    logging.config.fileConfig(os.getenv('LOGGING_CONF', default_conf))
+    # Force the botocore and requests loggers to WARNING to avoid leaking any
+    # credentials or sensitive information
+    for noisy_logger in ("botocore", "requests"):
+        logging.getLogger(noisy_logger).setLevel(logging.WARNING)