-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_all.py
107 lines (78 loc) · 2.87 KB
/
run_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
import argparse
import subprocess
from threading import Thread
from queue import Queue, Empty
from pathlib import Path
import torch
import time
def wait_gpu_free(id, free_count_req):
    """Block until GPU ``id`` has read idle on enough consecutive polls.

    The device is polled with ``torch.cuda.utilization`` and a reading of
    exactly 0 counts as idle.  Any non-zero reading resets the streak, so the
    idle readings must be consecutive.  Between polls the function sleeps for
    30 seconds.

    Args:
        id: Device index handed to ``torch.cuda.utilization``.  (The name
            shadows the builtin; it is kept because callers may pass it by
            keyword.)
        free_count_req: Number of consecutive idle readings required before
            returning.

    Returns:
        None, as soon as the idle streak reaches ``free_count_req``.
    """
    poll_interval = 30.0
    idle_streak = 0
    print(f"Waiting for GPU {id} to go idle... ({free_count_req})")
    while True:
        if torch.cuda.utilization(id) != 0:
            # Busy reading: the streak has to start over.
            idle_streak = 0
        else:
            idle_streak += 1
            print(f"GPU {id} free at {idle_streak}/{free_count_req}")
            if idle_streak >= free_count_req:
                break
        time.sleep(poll_interval)
def worker_func(qu: Queue, device, wait_free=False, wait_free_count=None):
    """Drain ``qu`` by running each queued command pinned to one GPU.

    Every command is executed with ``subprocess.run`` in a copy of the current
    environment in which ``CUDA_VISIBLE_DEVICES`` is set to ``device``.  The
    function returns once the queue is empty.

    Args:
        qu: Queue of commands (argument lists as accepted by
            ``subprocess.run``).  It is polled without blocking, so it must
            be fully populated before the worker starts.
        device: GPU index; exported to the child processes as ``str(device)``.
        wait_free: When true, call ``wait_gpu_free`` for this device before
            taking the first command.
        wait_free_count: Number of consecutive idle readings passed to
            ``wait_gpu_free``.  Required when ``wait_free`` is true.

    Raises:
        ValueError: If ``wait_free`` is true but ``wait_free_count`` is None.
    """
    if wait_free and wait_free_count is None:
        # Fail fast: passing None through would only surface as a TypeError
        # inside wait_gpu_free once the GPU first reads idle.
        raise ValueError("wait_free_count must be set when wait_free is True")

    # CUDA_DEVICE_ORDER is inherited through this copy; the module sets it in
    # os.environ at import time, so it does not need to be set again here.
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = str(device)

    if wait_free:
        wait_gpu_free(int(device), wait_free_count)

    while True:
        try:
            cmd = qu.get(block=False)
        except Empty:
            return
        result = subprocess.run(cmd, env=env)
        if result.returncode != 0:
            # Keep going with the remaining jobs, but do not hide the failure.
            print(
                f"Command {cmd} on GPU {device} "
                f"exited with code {result.returncode}"
            )
def _group_variable_args(var_args):
    """Group swept arguments into ``[option, value, value, ...]`` lists.

    Raises:
        ValueError: If a value appears before the first ``--option``.
    """
    groups = []
    for arg in var_args:
        if arg.startswith("--"):
            groups.append([arg])
        elif groups:
            groups[-1].append(arg)
        else:
            raise ValueError(
                f"expected an option starting with '--' before value {arg!r}"
            )
    return groups


def _build_commands(script, var_groups, const_args, reverse=False):
    """Build one ``python <script> ...`` command per swept value index.

    Raises:
        ValueError: If there are no swept options, an option has no values,
            or the options do not all carry the same number of values.
    """
    if not var_groups:
        raise ValueError("no swept options given; nothing to run")
    num_tasks = len(var_groups[0]) - 1
    if num_tasks == 0:
        raise ValueError(f"option {var_groups[0][0]} has no values to sweep over")
    for group in var_groups:
        if len(group) - 1 != num_tasks:
            raise ValueError(
                f"option {group[0]} has {len(group) - 1} value(s), "
                f"expected {num_tasks} like {var_groups[0][0]}"
            )

    order = reversed(range(num_tasks)) if reverse else range(num_tasks)
    cmds = []
    for i in order:
        cmd = ["python", script]
        for group in var_groups:
            # The i-th task takes the i-th value of every swept option.
            cmd += [group[0], group[i + 1]]
        cmd += const_args
        cmds.append(cmd)
    return cmds


def main():
    """Run ``script`` once per swept value, spreading the runs over GPUs.

    Recognised options are ``--device`` (one or more GPU indices, required),
    ``--wait_gpu_free``, ``--wait_gpu_free_times`` and ``--reversed``.  All
    other arguments are forwarded to ``script``:

    * If they contain a bare ``--``, everything before it is swept and
      everything after it is passed unchanged to every run.
    * Otherwise only the first ``--option`` and its values are swept; the
      remaining arguments are passed unchanged to every run.

    Each swept option must carry the same number of values; run ``i`` gets the
    ``i``-th value of every swept option.  One worker thread is started per
    device and the workers pull commands from a shared queue until it is
    empty.  Invalid input is reported through ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("script")
    parser.add_argument("--device", nargs="+", required=True)
    parser.add_argument("--wait_gpu_free", action="store_true")
    parser.add_argument("--wait_gpu_free_times", type=int, default=10)
    parser.add_argument("--reversed", action="store_true")
    args, other_args = parser.parse_known_args()

    script = args.script
    devices = args.device
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if not Path(script).is_file():
        parser.error(f"script not found: {script}")

    if "--" in other_args:
        split_idx = other_args.index("--")
        other_args_var = other_args[:split_idx]
        other_args_const = other_args[split_idx + 1:]
    else:
        # No separator: the swept part ends where the second option begins.
        split_idx = next(
            (i for i, arg in enumerate(other_args) if i > 0 and arg.startswith("--")),
            len(other_args),
        )
        other_args_var = other_args[:split_idx]
        other_args_const = other_args[split_idx:]

    try:
        groups = _group_variable_args(other_args_var)
        cmds = _build_commands(script, groups, other_args_const, reverse=args.reversed)
    except ValueError as err:
        parser.error(str(err))
    print(cmds)

    qu = Queue()
    for cmd in cmds:
        qu.put(cmd)

    workers = [
        Thread(
            target=worker_func,
            args=(qu, device, args.wait_gpu_free, args.wait_gpu_free_times),
        )
        for device in devices
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()