diff --git a/.github/workflows/test-inference-pipeline.yml b/.github/workflows/test-inference-pipeline.yml
index 2c841ec..062ed09 100644
--- a/.github/workflows/test-inference-pipeline.yml
+++ b/.github/workflows/test-inference-pipeline.yml
@@ -43,6 +43,14 @@ jobs:
 
       - name: Run inference for deepFCD
         run: |
-          ./app/inference.py sub-00055 t1.nii.gz flair.nii.gz ~/io cuda 1 1
+          ./app/inference.py ${CI_TESTING_PATIENT_ID} t1.nii.gz flair.nii.gz ~/io cuda 1 1
         env:
-          CI_TESTING_GT: "./tests/segmentations/sub-00055/sub-00055_label_dilated_final.nii.gz"
\ No newline at end of file
+          CI_TESTING_PATIENT_ID: "sub-00055"
+          CI_TESTING_GT: "./tests/segmentations/sub-00055/sub-00055_label_dilated_final.nii.gz"
+          CI_TESTING_PRED_DIR: "/home/ga/io"
+          
+      - name: Run tests to compare outputs with previous validated runs
+        run: bash ./tests/run_tests.sh  # runs tests/test_deepFCD.py from inside the tests/ directory
+        env:
+          CI_TESTING_PATIENT_ID: "sub-00055"  # same case ID that is passed to the inference step
+          CI_TESTING_PRED_DIR: "/home/ga/io"  # the ~/io output directory of the inference step, assuming HOME=/home/ga on the runner
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index baa13c9..3a91e2e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -58,6 +58,8 @@ RUN python -m pip install -r /app/requirements.txt \
 
 COPY app/ /app/
 
+COPY tests/ /tests/
+
 RUN sudo chmod -R 777 /app && sudo chmod +x /app/inference.py
 
 CMD ["python3"]
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 3381e2a..420284b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,12 @@
 ACCOUNT := noelmni
 SERVICE := deep-fcd
 IMAGE 	:= $(ACCOUNT)/$(SERVICE) # noelmni/deep-fcd
-TAG		:= resamp_orig
+TAG		:= latest
 UID		:= 2551
 GID		:= 618
 CASE_ID := sub-00055
 TMPDIR	:= /host/hamlet/local_raid/data/ravnoor/sandbox
+PRED_DIR	:= /host/hamlet/local_raid/data/ravnoor/sandbox/pytests
 BRAIN_MASKING := 1
 PREPROCESS		:= 1
 
@@ -47,6 +48,17 @@ test-pipeline-docker_ci:
 	$(ACCOUNT)/$(SERVICE):$(TAG) \
 	/app/inference.py $(CASE_ID) T1.nii.gz FLAIR.nii.gz /tmp cuda0 $(BRAIN_MASKING) $(PREPROCESS)
 
+test-pipeline-docker_testing: ## run /tests/run_tests.sh inside the image; PRED_DIR (the predictions to check) is mounted at /tmp
+	docker run --rm -it --init \
+	--gpus=all	\
+	--user="$(UID):$(GID)" \
+	--volume="$(PRED_DIR):/tmp" \
+	--env CI_TESTING=1 \
+	--env CI_TESTING_PATIENT_ID=$(CASE_ID) \
+	--env CI_TESTING_PRED_DIR=/tmp \
+	$(ACCOUNT)/$(SERVICE):$(TAG) \
+		bash /tests/run_tests.sh
+
 test-reporting:
 	./app/utils/reporting.py $(CASE_ID) $(TMPDIR)/
 
@@ -68,22 +80,22 @@ prune:
 	docker image prune
 
 runner-build:
-	docker-compose -f runner.docker-compose.yml build
+	docker-compose -f ci/runner.docker-compose.yml build
 
 runner-ps:
-	docker-compose -f runner.docker-compose.yml ps
+	docker-compose -f ci/runner.docker-compose.yml ps
 
 runner-up:
-	docker-compose -f runner.docker-compose.yml up --remove-orphans -d
+	docker-compose -f ci/runner.docker-compose.yml up --remove-orphans -d
 
 runner-down:
-	docker-compose -f runner.docker-compose.yml down
+	docker-compose -f ci/runner.docker-compose.yml down
 
 runner-logs:
-	docker-compose -f runner.docker-compose.yml logs -f
+	docker-compose -f ci/runner.docker-compose.yml logs -f
 
 runner-scale:
-	docker-compose up --scale runner=1 -d
+	docker-compose -f ci/runner.docker-compose.yml up --scale runner=1 -d
 
 runner-bash:
-	docker-compose -f runner.docker-compose.yml exec -it runner bash
\ No newline at end of file
+	docker-compose -f ci/runner.docker-compose.yml exec -it runner bash
\ No newline at end of file
diff --git a/runner.Dockerfile b/ci/runner.Dockerfile
similarity index 95%
rename from runner.Dockerfile
rename to ci/runner.Dockerfile
index d8d2ab2..81b1c45 100644
--- a/runner.Dockerfile
+++ b/ci/runner.Dockerfile
@@ -11,6 +11,7 @@ LABEL maintainer="Ravnoor Singh Gill <ravnoor@gmail.com>" \
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
 
 ARG RUNNER_VERSION=2.309.0
+ARG NVM_VERSION=0.39.5
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update -y
@@ -32,7 +33,7 @@ ENV HOME=/home/ga
 
 # https://stackoverflow.com/questions/25899912/how-to-install-nvm-in-docker/60137919#60137919
 SHELL ["/bin/bash", "--login", "-i", "-c"]
-RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash
+RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v${NVM_VERSION}/install.sh | bash
 RUN source /root/.bashrc && nvm install 16
 SHELL ["/bin/bash", "--login", "-c"]
 
diff --git a/runner.docker-compose.yml b/ci/runner.docker-compose.yml
similarity index 90%
rename from runner.docker-compose.yml
rename to ci/runner.docker-compose.yml
index 3394366..03811b8 100644
--- a/runner.docker-compose.yml
+++ b/ci/runner.docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.9'
 
 services:
   runner:
-    image: noelmni/deep-fcd:runner_alpha
+    image: noelmni/deep-fcd:runner_latest
     # command: '/app/inference.py FCD_001 T1.nii.gz FLAIR.nii.gz /io cuda0 1 1'
     # command: nvidia-smi
     # entrypoint: /bin/bash
@@ -11,6 +11,7 @@ services:
       dockerfile: runner.Dockerfile
       args:
         RUNNER_VERSION: '2.309.0'
+        NVM_VERSION: '0.39.5'
     deploy:
       resources:
         reservations:
diff --git a/start-runner.sh b/ci/start-runner.sh
similarity index 100%
rename from start-runner.sh
rename to ci/start-runner.sh
diff --git a/tests/run_tests.sh b/tests/run_tests.sh
new file mode 100644
index 0000000..05d0c9e
--- /dev/null
+++ b/tests/run_tests.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Run the deepFCD regression tests from the directory that contains this script.
+set -e
+
+pushd "$(dirname "$0")"
+
+echo "Running all tests"
+python3 test_deepFCD.py "$@"  # quoted so forwarded arguments are not word-split or globbed
+popd
\ No newline at end of file
diff --git a/tests/segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_mean_1.nii.gz b/tests/segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_mean_1.nii.gz
new file mode 100644
index 0000000..a0b887c
Binary files /dev/null and b/tests/segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_mean_1.nii.gz differ
diff --git a/tests/segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_var_1.nii.gz b/tests/segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_var_1.nii.gz
new file mode 100644
index 0000000..dc4088a
Binary files /dev/null and b/tests/segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_var_1.nii.gz differ
diff --git a/tests/test_deepFCD.py b/tests/test_deepFCD.py
new file mode 100644
index 0000000..496d465
--- /dev/null
+++ b/tests/test_deepFCD.py
@@ -0,0 +1,150 @@
+"""
+Test deepFCD.py
+
+nptest.assert_allclose
+self.assertEqual
+self.assertTrue
+
+"""
+
+import os
+import unittest
+from tempfile import mktemp
+
+import ants
+import numpy as np
+import numpy.testing as nptest
+
+from utils import compare_images
+
+
+# Each setting is read from the environment on its own and falls back to the local sandbox default.
+params = {
+    "CI_TESTING_PRED_DIR": os.environ.get(
+        "CI_TESTING_PRED_DIR", "/host/hamlet/local_raid/data/ravnoor/sandbox/pytests"
+    ),
+    "CI_TESTING_PATIENT_ID": os.environ.get("CI_TESTING_PATIENT_ID", "sub-00055"),
+}
+
+
+class TestModule_deepFCD(unittest.TestCase):
+
+    def setUp(self):
+        # reference outputs of a previously validated run (the "ground truth" here); the paths are relative, so the tests must run from tests/
+        self.gt_deepMask = ants.image_read('segmentations/sub-00055/sub-00055_brain_mask_final.nii.gz').clone('unsigned int')
+        self.gt_deepFCD_mean = ants.image_read('segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_mean_1.nii.gz').clone('float')
+        self.gt_deepFCD_var = ants.image_read('segmentations/sub-00055/noel_deepFCD_dropoutMC/sub-00055_noel_deepFCD_dropoutMC_prob_var_1.nii.gz').clone('float')
+        
+        pred_path = os.path.join(params["CI_TESTING_PRED_DIR"], params["CI_TESTING_PATIENT_ID"])
+        # load the predictions of the most recent run from <pred dir>/<patient id>/
+        self.pred_deepMask = ants.image_read(pred_path + '/' + params["CI_TESTING_PATIENT_ID"] + '_brain_mask_final.nii.gz').clone('unsigned int')
+        self.pred_deepFCD_mean = ants.image_read(pred_path + '/noel_deepFCD_dropoutMC/' + params["CI_TESTING_PATIENT_ID"] + '_noel_deepFCD_dropoutMC_prob_mean_1.nii.gz').clone('float')
+        self.pred_deepFCD_var = ants.image_read(pred_path + '/noel_deepFCD_dropoutMC/' + params["CI_TESTING_PATIENT_ID"] + '_noel_deepFCD_dropoutMC_prob_var_1.nii.gz').clone('float')
+
+        self.imgs = [self.pred_deepMask, self.pred_deepFCD_mean, self.pred_deepFCD_var]  # images exercised by the header and read/write tests
+        self.pixeltypes = ['unsigned char', 'unsigned int', 'float']  # pixel types each image is cast to in test_image_read_write
+
+    def tearDown(self):
+        pass  # no per-test cleanup is performed here
+
+    def test_image_header_info(self):
+        # write each prediction to disk and check ants.image_header_info against the in-memory image
+        for img in self.imgs:
+            img.set_spacing([6.9]*img.dimension)
+            img.set_origin([3.6]*img.dimension)
+            tmpfile = mktemp(suffix='.nii.gz')
+            ants.image_write(img, tmpfile)
+
+            info = ants.image_header_info(tmpfile)
+            self.assertEqual(info['dimensions'], img.shape)
+            nptest.assert_allclose(info['direction'], img.direction)
+            self.assertEqual(info['nComponents'], img.components)
+            self.assertEqual(info['nDimensions'], img.dimension)
+            self.assertEqual(info['origin'], img.origin)
+            self.assertEqual(info['pixeltype'], img.pixeltype)
+            self.assertEqual(info['pixelclass'], 'vector' if img.has_components else 'scalar')
+            self.assertEqual(info['spacing'], img.spacing)
+
+            try:
+                os.remove(tmpfile)
+            except OSError:
+                pass  # best-effort cleanup; only file-system errors are ignored
+
+        # non-existent file
+        with self.assertRaises(Exception):
+            tmpfile = mktemp(suffix='.nii.gz')
+            ants.image_header_info(tmpfile)
+
+
+    def test_image_read_write(self):
+        # Round-trip each prediction through ants.image_write / ants.image_read.
+        from tempfile import TemporaryDirectory
+        scratch = TemporaryDirectory()  # private directory for every file written below
+        self.addCleanup(scratch.cleanup)  # removed even when an assertion fails
+
+        # test scalar images
+        for img in self.imgs:
+            img = (img - img.min()) / (img.max() - img.min())
+            img = img * 255.
+            img = img.clone('unsigned char')
+            for ptype in self.pixeltypes:
+                img = img.clone(ptype)
+                tmpfile = mktemp(suffix='.nii.gz', dir=scratch.name)
+                ants.image_write(img, tmpfile)
+                img2 = ants.image_read(tmpfile)
+                self.assertTrue(ants.image_physical_space_consistency(img,img2))
+                self.assertEqual(img2.components, img.components)
+                nptest.assert_allclose(img.numpy(), img2.numpy())
+
+            # unsupported ptype
+            with self.assertRaises(Exception):
+                ants.image_read(tmpfile, pixeltype='not-suppoted-ptype')
+
+        # test saving/loading as npy
+        for img in self.imgs:
+            tmpfile = mktemp(suffix='.npy', dir=scratch.name)
+            ants.image_write(img, tmpfile)
+            img2 = ants.image_read(tmpfile)
+            self.assertTrue(ants.image_physical_space_consistency(img,img2))
+            self.assertEqual(img2.components, img.components)
+            nptest.assert_allclose(img.numpy(), img2.numpy())
+
+            # with no json header
+            arr = img.numpy()
+            tmpfile = mktemp(suffix='.npy', dir=scratch.name)
+            np.save(tmpfile, arr)
+            img2 = ants.image_read(tmpfile)
+            nptest.assert_allclose(img.numpy(), img2.numpy())
+
+        # nonexistent file
+        with self.assertRaises(Exception):
+            tmpfile = mktemp(suffix='.nii.gz', dir=scratch.name)
+            ants.image_read(tmpfile)
+
+
+    def test_brain_mask_segmentation(self):
+        metric = compare_images(self.gt_deepMask, self.pred_deepMask)
+        print("overlap of the brain mask with the label: {}".format(metric))
+        # NOTE(review): no metric_type is passed, so compare_images uses its default ('correlation'),
+        # not the overlap measure named in the message above -- confirm which metric is intended
+        nptest.assert_allclose(1., metric, rtol=0.05, atol=0)  # passes when |1 - metric| <= 0.05 * |metric|
+
+
+    def test_deepFCD_segmentation_mean(self):
+        metric = compare_images(self.gt_deepFCD_mean, self.pred_deepFCD_mean, metric_type='correlation')
+        print("correlation of the mean probability map with the the label: {}".format(metric))
+        # the mean probability map must correlate with the validated reference map
+        # rtol is relative to `metric` (the `desired` argument): passes when |1 - metric| <= 0.05 * |metric|
+        nptest.assert_allclose(1., metric, rtol=0.05, atol=0)
+
+
+    def test_deepFCD_segmentation_var(self):
+        metric = compare_images(self.gt_deepFCD_var, self.pred_deepFCD_var, metric_type='correlation')
+        print("correlation of the mean uncertainty map with the the label: {}".format(metric))
+        # the variance (uncertainty) map must correlate with the validated reference map
+        # rtol is relative to `metric` (the `desired` argument): passes when |1 - metric| <= 0.05 * |metric|
+        nptest.assert_allclose(1., metric, rtol=0.05, atol=0)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..f914abf
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,52 @@
+import ants
+import numpy as np
+
+def dilate_labels(label, dilated_label_fname):
+  """
+    Dilate a binary label image and write the result to disk.
+
+    The image is read with `ants.image_read`, dilated with
+    `ants.morphology(operation='dilate', radius=30, mtype='binary', shape='ball')`,
+    and saved with `to_filename`.
+
+    Arguments
+    ---------
+    label : string
+        filename of the label image to read
+
+    dilated_label_fname : string
+        filename the dilated label image is written to
+
+    Returns
+    -------
+    None
+
+    Example
+    -------
+    >>> dilate_labels('label.nii.gz', 'label_dilated.nii.gz')
+  """
+  label = ants.image_read(label)
+  ants.morphology(label, operation='dilate', radius=30, mtype='binary', shape='ball').to_filename(dilated_label_fname)
+
+
+def compare_images(predicted_image, ground_truth_image, metric_type='correlation'):
+  """
+    Measure similarity between two images.
+
+    With `metric_type='correlation'` (the default) the ANTs
+    `ANTSNeighborhoodCorrelation` metric is computed. Per ITK/ANTs convention
+    that metric is a distance (the similarity of an image with itself is -1),
+    so its absolute value is returned.
+
+    Any other `metric_type` returns `TotalOrTargetOverlap[1]` from
+    `ants.label_overlap_measures`.
+
+    Both images are passed to ANTs as given; they are not read from disk here.
+  """
+  if metric_type == 'correlation':
+    metric = ants.image_similarity(predicted_image, ground_truth_image, metric_type='ANTSNeighborhoodCorrelation')
+    metric = np.abs(metric)
+  else:
+    metric = ants.label_overlap_measures(predicted_image, ground_truth_image).TotalOrTargetOverlap[1]
+
+  return metric
\ No newline at end of file