Merge pull request #1130 from lukeyeager/fix-torch-cudavisibledevices

Bugfix - Torch and CUDA_VISIBLE_DEVICES
NVIDIA · Oct 4, 2016 · 38825d1 · 38825d1
2 parents f8b74e8 + b647c20
commit 38825d1
Showing 1 changed file with 25 additions and 3 deletions.
diff --git a/digits/model/tasks/torch_train.py b/digits/model/tasks/torch_train.py
@@ -29,6 +29,28 @@
 TORCH_MODEL_FILE = 'model.lua'
 TORCH_SNAPSHOT_PREFIX = 'snapshot'
 
+
+def subprocess_visible_devices(gpus):
+    """
+    Calculates CUDA_VISIBLE_DEVICES for a subprocess
+
+    If this process is itself running with CUDA_VISIBLE_DEVICES set, the
+    indices in `gpus` are relative to the devices visible to this process.
+    They are translated back to the identifiers listed in the inherited
+    variable so that the subprocess ends up with the intended devices.
+
+    Arguments:
+    gpus -- a list of device indices (ints or int-like strings) as seen by
+        the current process
+
+    Returns a comma-separated string suitable for CUDA_VISIBLE_DEVICES.
+
+    Raises ValueError if `gpus` is not a list, if an entry cannot be
+    converted to an int, or if an index does not refer to a device that is
+    visible to the current process.
+    """
+    if not isinstance(gpus, list):
+        raise ValueError('gpus should be a list')
+    gpus = [int(g) for g in gpus]
+
+    old_cvd = os.environ.get('CUDA_VISIBLE_DEVICES', None)
+    if old_cvd is None:
+        # No restriction inherited: visible indices are the real indices
+        return ','.join(str(g) for g in gpus)
+
+    # Position in this list == index visible to the current process
+    visible_to_real = []
+    for token in old_cvd.split(','):
+        token = token.strip()
+        if not token:
+            # An empty entry (including a completely empty variable) is not
+            # a usable identifier; entries after it are treated as hidden.
+            break
+        try:
+            # Normalize integer entries (e.g. ' 01' -> '1')
+            visible_to_real.append(str(int(token)))
+        except ValueError:
+            # Non-integer identifiers (e.g. 'GPU-<uuid>') pass through as-is
+            visible_to_real.append(token)
+
+    real_gpus = []
+    for visible_gpu in gpus:
+        # Reject negative indices explicitly; list indexing would otherwise
+        # silently wrap around to the end of the list.
+        if not 0 <= visible_gpu < len(visible_to_real):
+            raise ValueError(
+                'GPU %d is not visible to this process '
+                '(CUDA_VISIBLE_DEVICES=%r)' % (visible_gpu, old_cvd))
+        real_gpus.append(visible_to_real[visible_gpu])
+    return ','.join(real_gpus)
+
+
 @subclass
 class TorchTrainTask(TrainTask):
     """
@@ -239,7 +261,7 @@ def task_arguments(self, resources, env):
             # don't make other GPUs visible though since Torch will load
             # CUDA libraries and allocate memory on all visible GPUs by
             # default.
-            env['CUDA_VISIBLE_DEVICES'] = ','.join(identifiers)
+            env['CUDA_VISIBLE_DEVICES'] = subprocess_visible_devices(identifiers)
             # switch to GPU mode
             args.append('--type=cuda')
         else:
@@ -570,7 +592,7 @@ def infer_one_image(self, image, snapshot_epoch=None, layers=None, gpu=None):
         if gpu is not None:
             args.append('--type=cuda')
             # make only the selected GPU visible
-            env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
+            env['CUDA_VISIBLE_DEVICES'] = subprocess_visible_devices([gpu])
         else:
             args.append('--type=float')
 
@@ -860,7 +882,7 @@ def infer_many_images(self, images, snapshot_epoch=None, gpu=None):
             if gpu is not None:
                 args.append('--type=cuda')
                 # make only the selected GPU visible
-                env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
+                env['CUDA_VISIBLE_DEVICES'] = subprocess_visible_devices([gpu])
             else:
                 args.append('--type=float')