Merge pull request #12 from Spico197/dev

Dev
Spico197 · Aug 4, 2021 · 6b24567 · 6b24567
2 parents 875eb6b + 6dc0eaa
commit 6b24567
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -130,6 +130,7 @@ send_email(
 To get more reminders, please check `watchmen/reminder.py`.
 
 ## UPDATE
+- v0.3.8: the `OK` status is now shown only in the finished queue, while the working queue shows `ready`; fix a severe scheduling bug
 - v0.3.7: much faster due to lock free changes! fix timeout and schedule bug
 - v0.3.6: fix front-end api hostname bug
 - v0.3.5: fix front-end api port bug

diff --git a/watchmen/__init__.py b/watchmen/__init__.py
@@ -1,4 +1,4 @@
 from .client import WatchClient
 from .client import ClientMode
 
-__version__ = "0.3.7"
+__version__ = "0.3.8"
diff --git a/watchmen/client.py b/watchmen/client.py
@@ -1,4 +1,5 @@
 import time
+import logging
 import datetime
 import getpass
 from enum import Enum
@@ -11,9 +12,13 @@
 from watchmen.listener import check_gpus_existence, check_req_gpu_num
 
 
+logger = logging.getLogger("common")
+
+
 class ClientStatus(str, Enum):
     WAITING = "waiting"
     TIMEOUT = "timeout"
+    READY = "ready"
     OK = "ok"
 
 
@@ -118,8 +123,11 @@ def ping(self):
         else:
             if result["msg"] == ClientStatus.WAITING:
                 return False, result["available_gpus"]
-            elif result["msg"] == ClientStatus.OK:
+            elif result["msg"] == ClientStatus.READY:
                 return True, result["available_gpus"]
+            elif result["msg"] == ClientStatus.OK:
+                logger.warning("Status is OK, which has finished requesting GPUs.")
+                return False, result["available_gpus"]
             elif result["msg"] == ClientStatus.TIMEOUT:
                 raise RuntimeError("status changed to TIMEOUT")
 

diff --git a/watchmen/server.py b/watchmen/server.py
@@ -246,21 +246,25 @@ def check_gpu_info():
 def check_work(queue_timeout):
     logger.info("regular check")
     marked_finished = []
-    reserved_gpus = set()  # whether there can be multiple `ok` in one scan
+    reserved_gpus = set()
+    client_list = []
     queue_num = 0
     for client_id, client in cc.work_queue.items():
         time_delta = datetime.datetime.now() - client.last_request_time
         logger.info(f"client: {client.id}, time_delta.seconds: {time_delta.seconds}, time_delta: {time_delta}")
         if time_delta.seconds > queue_timeout:
-            if client.status != ClientStatus.OK:
+            if client.status != ClientStatus.READY:
                 client.status = ClientStatus.TIMEOUT
-            client.queue_num = -1  # invalid client
+            else:
+                client.status = ClientStatus.OK
+            # invalid client
+            client.queue_num = -1
             marked_finished.append(client_id)
             continue
         client.queue_num = queue_num
         ok = False
         available_gpus = []
-        if client.status == ClientStatus.OK:
+        if client.status == ClientStatus.READY:
             reserved_gpus |= set(client.available_gpus)
         else:
             try:
@@ -279,12 +283,16 @@ def check_work(queue_timeout):
             except RuntimeError as err:
                 client.msg = str(err)
 
-        if ok and len(set(available_gpus) & reserved_gpus) <= 0:
-            client.status = ClientStatus.OK
+        client_list.append([client_id, client, ok, set(available_gpus)])
+        queue_num += 1
+
+    # post-check and assignment: make sure GPUs held by `ready` clients are not assigned to other clients
+    for client_id, client, ok, available_gpu_set in client_list:
+        if ok and len(available_gpu_set) > 0 and len(available_gpu_set & reserved_gpus) < 1:
+            client.status = ClientStatus.READY
             client.available_gpus = available_gpus
             reserved_gpus |= set(client.available_gpus)
             logger.info(f"client: {client.id} is ready, available gpus: {client.available_gpus}")
-        queue_num += 1
 
     for client_id in marked_finished:
         logger.info(f"client {client.id} marked as finished, status: {client.status}")
@@ -335,7 +343,7 @@ def api_server(host, port):
                         help="host address for api server")
     parser.add_argument("--port", type=str, default=62333,
                         help="port for api server")
-    parser.add_argument("--queue_timeout", type=int, default=120,
+    parser.add_argument("--queue_timeout", type=int, default=300,
                         help="timeout for queue waiting (seconds)")
     parser.add_argument("--request_interval", type=int, default=1,
                         help="interval for gpu status requesting (seconds)")

diff --git a/watchmen/templates/index.html b/watchmen/templates/index.html
@@ -103,6 +103,10 @@
     .ok {
       color: green;
     }
+
+    .ready {
+      color: green;
+    }
   </style>
 </head>
 
@@ -161,6 +165,7 @@
           <th>Mode</th>
           <th>GPU Scope</th>
           <th>Request GPU Num</th>
+          <th>Available GPUs</th>
           <th>Register Time</th>
           <th>Last Request Time</th>
         </tr>
@@ -245,7 +250,7 @@
         for (let i = 0; i < data.work_queue.length; i++) {
           let c = data.work_queue[i]
           let tr = document.createElement("tr")
-          tr.innerHTML = `<td>${c.queue_num}</td> <td class="${c.status}">${c.status}</td> <td>${c.id}</td> <td>${c.mode}</td> <td>${c.gpus}</td> <td>${c.req_gpu_num}</td> <td>${c.register_time}</td> <td>${c.last_request_time}</td>`
+          tr.innerHTML = `<td>${c.queue_num}</td> <td class="${c.status}">${c.status}</td> <td>${c.id}</td> <td>${c.mode}</td> <td>${c.gpus}</td> <td>${c.req_gpu_num}</td> <td>${c.available_gpus}</td> <td>${c.register_time}</td> <td>${c.last_request_time}</td>`
           workingStats.appendChild(tr)
         }
       } else if (selected === "finished-queue-template") {