Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nvidia monitor fixes #239

Merged
merged 6 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,4 @@ to CMake using `Gperftools_ROOT_DIR`.

# Copyright

Copyright (c) 2018-2023 CERN.
Copyright (c) 2018-2024 CERN.
41 changes: 41 additions & 0 deletions package/scripts/gpu-burner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#! /usr/bin/env python3
#
# This is a slightly adapted "hello, world" script from
# pycuda, that can be used for stressing a CUDA GPU for
# tests
#
# pycuda is required!
#

import pycuda.autoinit  # noqa: F401  # side-effect import: initialises the CUDA context
import pycuda.driver as drv
import numpy
from time import time

from pycuda.compiler import SourceModule

# Duration of the burn loop, in seconds
BURN_SECONDS = 20

# Number of elements; must match the launch block size below
N = 1024

# Trivial elementwise kernel: dest[i] = a[i] * b[i] + c[i]
mod = SourceModule(
    """
__global__ void multiply_them(float *dest, float *a, float *b, float *c)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i] + c[i];
}
"""
)

multiply_them = mod.get_function("multiply_them")

# Random single-precision input vectors
a = numpy.random.randn(N).astype(numpy.float32)
b = numpy.random.randn(N).astype(numpy.float32)
c = numpy.random.randn(N).astype(numpy.float32)

dest = numpy.zeros_like(a)

# Launch the kernel repeatedly for BURN_SECONDS to keep the GPU busy
start = time()
while time() - start < BURN_SECONDS:
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b), drv.In(c), block=(N, 1, 1), grid=(1, 1)
    )

# Residual check: the kernel computed dest = a*b + c, so this should print
# (numerically) all zeros. NOTE: the unparenthesised form "dest - a * b + c"
# would evaluate to 2*c and never show zeros, masking a broken run.
print(dest - (a * b + c))
15 changes: 10 additions & 5 deletions package/scripts/precook_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,19 +95,24 @@ def make_net(proc_net, fixed_value, rand=False):

def make_nvidia(proc_nvidia, fixed_value, rand=False):
# idx
print(proc_nvidia, fixed_value, rand)
smi_fname = os.path.join(proc_nvidia, "smi")
pct_lim = 100
memory_lim = 10000
with open(smi_fname, "w") as f:
params = [
0, # idx
pid, # pid
"G", # type
random.randint(0, memory_lim) if rand else fixed_value, # sm
random.randint(0, memory_lim) if rand else fixed_value, # mem
# enc, dec are not monitored metrics
0, # enc
0, # dec
random.randint(0, pct_lim) if rand else fixed_value, # sm
random.randint(0, pct_lim) if rand else fixed_value, # mem
# The following are not monitored metrics
"-", # enc
"-", # dec
"-", # jpg
"-", # ofa
random.randint(0, memory_lim) if rand else fixed_value, # fb
0, # ccpm
"python3", # command
]
for param in params:
Expand Down
2 changes: 1 addition & 1 deletion package/scripts/precooked_tests/drop/1/nvidia/smi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0 1729 G 50 50 0 0 50 python3
0 1729 G 50 50 - - - - 50 0 python3
2 changes: 1 addition & 1 deletion package/scripts/precooked_tests/drop/2/nvidia/smi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0 1729 G 100 100 0 0 100 python3
0 1729 G 100 100 - - - - 100 0 python3
2 changes: 1 addition & 1 deletion package/scripts/precooked_tests/drop/3/nvidia/smi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0 1729 G 20 20 0 0 20 python3
0 1729 G 20 20 - - - - 20 0 python3
8 changes: 5 additions & 3 deletions package/scripts/prmon_compress_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,11 @@ def main():

parser.add_argument(
"--precision",
type=lambda x: float(x)
if 0 < float(x) < 1
else parser.exit(-1, "Precision must be strictly between 0 and 1"),
type=lambda x: (
float(x)
if 0 < float(x) < 1
else parser.exit(-1, "Precision must be strictly between 0 and 1")
),
default=0.05,
help="precision value for interpolation threshold",
)
Expand Down
10 changes: 5 additions & 5 deletions package/src/nvidiamon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,20 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
// Loop over output
unsigned int gpu_idx{}, sm{}, mem{}, fb_mem{};
pid_t pid{};
std::string enc{}, dec{}, cg_type{}, cmd_name{};
std::string enc{}, dec{}, jpg{}, ofa{}, cg_type{}, ccpm{}, cmd_name{};
std::unordered_map<unsigned int, bool>
activegpus{}; // Avoid double counting active GPUs
for (const auto& s : cmd_result.second) {
if (s[0] == '#') continue;
std::istringstream instr(s);
instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> fb_mem >>
cmd_name;
instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> jpg >> ofa >> fb_mem >>
ccpm >> cmd_name;
auto read_ok = !(instr.fail() || instr.bad()); // eof() is ok
if (read_ok) {
if (log_level <= spdlog::level::debug) {
std::stringstream strm;
strm << "Good read: " << gpu_idx << " " << pid << " " << cg_type << " "
<< sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
<< sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
<< " " << cmd_name << std::endl;
debug(strm.str());
}
Expand All @@ -115,7 +115,7 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
std::stringstream strm;
strm << "Bad read of line: " << s << std::endl;
strm << "Parsed to: " << gpu_idx << " " << pid << " " << cg_type << " "
<< sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
<< sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
<< " " << cmd_name << std::endl;

strm << "StringStream status: good()=" << instr.good();
Expand Down
Loading