Skip to content

Commit

Permalink
Merge pull request #12 from Spico197/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
Spico197 authored Aug 4, 2021
2 parents 875eb6b + 6dc0eaa commit 6b24567
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 11 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ send_email(
To get more reminders, please check `watchmen/reminder.py`.

## UPDATE
- v0.3.8: the `OK` status is now shown only in the finished queue, and the working queue shows `ready` instead. Fix a severe scheduling bug
- v0.3.7: much faster thanks to lock-free changes! Fix timeout and scheduling bugs
- v0.3.6: fix front-end api hostname bug
- v0.3.5: fix front-end api port bug
Expand Down
2 changes: 1 addition & 1 deletion watchmen/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .client import WatchClient
from .client import ClientMode

__version__ = "0.3.7"
__version__ = "0.3.8"
10 changes: 9 additions & 1 deletion watchmen/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
import logging
import datetime
import getpass
from enum import Enum
Expand All @@ -11,9 +12,13 @@
from watchmen.listener import check_gpus_existence, check_req_gpu_num


logger = logging.getLogger("common")


# Status values the server assigns to a queued client and that `ping`
# compares against `result["msg"]`. Mixing in `str` makes each member compare
# equal to its plain string value (presumably what the server reply carries
# -- confirm against the API response format).
class ClientStatus(str, Enum):
# Still waiting in the queue; `ping` returns False for this status.
WAITING = "waiting"
# Assigned by `check_work` to a client that exceeded the queue timeout
# without being ready; `ping` raises RuntimeError when it sees it.
TIMEOUT = "timeout"
# GPUs are reserved for this client; `ping` returns True for this status.
READY = "ready"
# The client has finished requesting GPUs; `check_work` assigns it to a
# ready client that timed out, and `ping` logs a warning and returns False.
OK = "ok"


Expand Down Expand Up @@ -118,8 +123,11 @@ def ping(self):
else:
if result["msg"] == ClientStatus.WAITING:
return False, result["available_gpus"]
elif result["msg"] == ClientStatus.OK:
elif result["msg"] == ClientStatus.READY:
return True, result["available_gpus"]
elif result["msg"] == ClientStatus.OK:
logger.warning("Status is OK, which has finished requesting GPUs.")
return False, result["available_gpus"]
elif result["msg"] == ClientStatus.TIMEOUT:
raise RuntimeError("status changed to TIMEOUT")

Expand Down
24 changes: 16 additions & 8 deletions watchmen/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,21 +246,25 @@ def check_gpu_info():
def check_work(queue_timeout):
logger.info("regular check")
marked_finished = []
reserved_gpus = set() # whether there can be multiple `ok` in one scan
reserved_gpus = set()
client_list = []
queue_num = 0
for client_id, client in cc.work_queue.items():
time_delta = datetime.datetime.now() - client.last_request_time
logger.info(f"client: {client.id}, time_delta.seconds: {time_delta.seconds}, time_delta: {time_delta}")
if time_delta.seconds > queue_timeout:
if client.status != ClientStatus.OK:
if client.status != ClientStatus.READY:
client.status = ClientStatus.TIMEOUT
client.queue_num = -1 # invalid client
else:
client.status = ClientStatus.OK
# invalid client
client.queue_num = -1
marked_finished.append(client_id)
continue
client.queue_num = queue_num
ok = False
available_gpus = []
if client.status == ClientStatus.OK:
if client.status == ClientStatus.READY:
reserved_gpus |= set(client.available_gpus)
else:
try:
Expand All @@ -279,12 +283,16 @@ def check_work(queue_timeout):
except RuntimeError as err:
client.msg = str(err)

if ok and len(set(available_gpus) & reserved_gpus) <= 0:
client.status = ClientStatus.OK
client_list.append([client_id, client, ok, set(available_gpus)])
queue_num += 1

# post check and assignment, and make sure gpus of `ready` clients will not be assigned to the others
for client_id, client, ok, available_gpu_set in client_list:
if ok and len(available_gpu_set) > 0 and len(available_gpu_set & reserved_gpus) < 1:
client.status = ClientStatus.READY
client.available_gpus = available_gpus
reserved_gpus |= set(client.available_gpus)
logger.info(f"client: {client.id} is ready, available gpus: {client.available_gpus}")
queue_num += 1

for client_id in marked_finished:
logger.info(f"client {client.id} marked as finished, status: {client.status}")
Expand Down Expand Up @@ -335,7 +343,7 @@ def api_server(host, port):
help="host address for api server")
parser.add_argument("--port", type=str, default=62333,
help="port for api server")
parser.add_argument("--queue_timeout", type=int, default=120,
parser.add_argument("--queue_timeout", type=int, default=300,
help="timeout for queue waiting (seconds)")
parser.add_argument("--request_interval", type=int, default=1,
help="interval for gpu status requesting (seconds)")
Expand Down
7 changes: 6 additions & 1 deletion watchmen/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@
.ok {
color: green;
}

.ready {
color: green;
}
</style>
</head>

Expand Down Expand Up @@ -161,6 +165,7 @@
<th>Mode</th>
<th>GPU Scope</th>
<th>Request GPU Num</th>
<th>Available GPUs</th>
<th>Register Time</th>
<th>Last Request Time</th>
</tr>
Expand Down Expand Up @@ -245,7 +250,7 @@
for (let i = 0; i < data.work_queue.length; i++) {
let c = data.work_queue[i]
let tr = document.createElement("tr")
tr.innerHTML = `<td>${c.queue_num}</td> <td class="${c.status}">${c.status}</td> <td>${c.id}</td> <td>${c.mode}</td> <td>${c.gpus}</td> <td>${c.req_gpu_num}</td> <td>${c.register_time}</td> <td>${c.last_request_time}</td>`
tr.innerHTML = `<td>${c.queue_num}</td> <td class="${c.status}">${c.status}</td> <td>${c.id}</td> <td>${c.mode}</td> <td>${c.gpus}</td> <td>${c.req_gpu_num}</td> <td>${c.available_gpus}</td> <td>${c.register_time}</td> <td>${c.last_request_time}</td>`
workingStats.appendChild(tr)
}
} else if (selected === "finished-queue-template") {
Expand Down

0 comments on commit 6b24567

Please sign in to comment.