Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check dyno and dcgm existence before disable them #2496

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions torchbenchmark/util/triton_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import tabulate
import torch
import triton

from torchbenchmark.util.env_check import fresh_triton_cache, set_random_seed
from torchbenchmark.util.experiment.metrics import get_peak_memory
from torchbenchmark.util.extra_args import apply_decoration_args, parse_decoration_args
Expand Down Expand Up @@ -1008,6 +1009,7 @@ def nsys_rep(self, input_id: int, fn_name: str) -> str:
def ncu_trace(
self, input_id: int, fn_name: str, replay: bool = False, profile_ir=False
) -> str:
import shutil
import subprocess

# collect the ncu trace
Expand All @@ -1031,6 +1033,7 @@ def ncu_trace(
"_ncu_trace_in_task",
]
)

# Disable DCGM
disable_dyno_dcgm = [
"sudo",
Expand All @@ -1045,13 +1048,26 @@ def ncu_trace(
"stop",
"nvidia-dcgm",
]
if (
subprocess.run(disable_dyno_dcgm).returncode != 0
and subprocess.run(disable_dcgm_service).returncode != 0
):
warnings.warn(
"DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..."
)

def service_exists(service_name):
try:
result = subprocess.run(
["systemctl", "status", service_name],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
)
return result.returncode == 0
except subprocess.CalledProcessError:
return False

if shutil.which("dyno") or service_exists("nvidia-dcgm"):
dyno_result = subprocess.run(disable_dyno_dcgm).returncode
systemctl_result = subprocess.run(disable_dcgm_service).returncode
if dyno_result != 0 and systemctl_result != 0:
warnings.warn(
"DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..."
)
ncu_output_dir = self.get_temp_path(f"ncu_traces/{fn_name}_{input_id}")
ncu_output_dir.mkdir(parents=True, exist_ok=True)
ext = ".csv" if not replay else ".ncu-rep"
Expand Down
Loading