Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit gProfiler memory & CPU usage and --log-usage support in exe mode. #564

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 66 additions & 8 deletions gprofiler/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
from typing import Iterable, Optional, Type, cast

import configargparse
from granulate_utils.exceptions import AlreadyInCgroup, UnsupportedCGroupV2
from granulate_utils.linux.cgroups.cpu_cgroup import CpuCgroup
from granulate_utils.linux.cgroups.memory_cgroup import MemoryCgroup
from granulate_utils.linux.ns import is_running_in_init_pid
from granulate_utils.linux.process import is_process_running
from granulate_utils.metadata import Metadata
Expand Down Expand Up @@ -67,6 +70,10 @@
DEFAULT_PROFILING_DURATION = datetime.timedelta(seconds=60).seconds
DEFAULT_SAMPLING_FREQUENCY = 11

# Default resource limits; kept the same as the limits in the k8s DaemonSet.
DEFAULT_CPU_LIMIT = 0.5  # cores (500m)
DEFAULT_MEMORY_LIMIT = 1 << 30  # bytes (1Gi)

# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get.
SIGINT_RATELIMIT = 0.5

Expand Down Expand Up @@ -602,6 +609,30 @@ def parse_cmd_args() -> configargparse.Namespace:
" beginning of a session.",
)

parser.add_argument(
"--limit-memory",
default=DEFAULT_MEMORY_LIMIT,
dest="memory_limit",
type=int,
help=f"Limit on the memory used by gProfiler. Units are bytes and the default is '{DEFAULT_MEMORY_LIMIT}'."
)

parser.add_argument(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do I specify no CPU limit? E.g. `--limit-cpu none` should be a possible parameter - not necessarily spelled this way, but you should be able to limit ONLY the memory or ONLY the CPU.

"--limit-cpu",
default=DEFAULT_CPU_LIMIT,
dest="cpu_limit",
type=float,
help=f"Limit on the cpu used by gProfiler. Units are cores and the default is '{DEFAULT_CPU_LIMIT}'."
)

parser.add_argument(
"--no-cgroups",
action="store_false",
dest="cgroups_changes",
default=True,
help="Disable the cgroups changes.",
)

args = parser.parse_args()

args.perf_inject = args.nodejs_mode == "perf"
Expand Down Expand Up @@ -679,12 +710,6 @@ def verify_preconditions(args: configargparse.Namespace) -> None:
)
sys.exit(1)

if args.log_usage and get_run_mode() not in ("k8s", "container"):
# TODO: we *can* move into another cpuacct cgroup, to let this work also when run as a standalone
# executable.
print("--log-usage is available only when run as a container!", file=sys.stderr)
sys.exit(1)


def setup_signals() -> None:
# When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
Expand Down Expand Up @@ -723,6 +748,30 @@ def init_pid_file(pid_file: str) -> None:
Path(pid_file).write_text(str(os.getpid()))


def set_limits(cpu: float, memory: int):
    """
    Move this process into "gprofiler" cpu and memory cgroups and apply limits to them.

    Nothing is returned; the function only acts on the cgroup objects.

    :param cpu: CPU limit, forwarded to ``CpuCgroup.set_cpu_limit_cores()``.
    :param memory: memory limit, forwarded to ``MemoryCgroup.set_limit_in_bytes()``.
    :raises UnsupportedCGroupV2: re-raised (after a debug log) when creating either cgroup object raises it.
    """
    try:
        cpu_controller, memory_controller = CpuCgroup(), MemoryCgroup()
    except UnsupportedCGroupV2:
        logger.debug("cgroup v2 is not supported by gProfiler, cpu and memory limits wouldn't be set.")
        raise

    pid = os.getpid()

    # A limit is applied only if the move succeeded; when AlreadyInCgroup is raised the
    # corresponding limit is left untouched.
    try:
        cpu_controller.move_to_cgroup("gprofiler", pid)
    except AlreadyInCgroup:
        logger.debug("gProfiler have already a cpu group.")
    else:
        cpu_controller.set_cpu_limit_cores(cpu)

    # NOTE(review): this branch logs at warning level while the cpu branch logs at debug - confirm
    # whether the difference is intentional.
    try:
        memory_controller.move_to_cgroup("gprofiler", pid)
    except AlreadyInCgroup:
        logger.warning("gProfiler have already a memory group.")
    else:
        memory_controller.set_limit_in_bytes(memory)


def main() -> None:
args = parse_cmd_args()
verify_preconditions(args)
Expand All @@ -738,10 +787,19 @@ def main() -> None:
remote_logs_handler,
)

# TODO(Creatone): Check the containerized scenario.
if args.cgroups_changes and get_run_mode() not in ("k8s", "container"):
logger.info(f"Trying to set resource limits, cpu='{args.cpu_limit}' "
f"cores and memory='{args.memory_limit >> 20:.2f}' MB.")
try:
set_limits(args.cpu_limit, args.memory_limit)
except Exception:
logger.exception("Failed to set resource limits, continuing anyway")

setup_signals()
reset_umask()
# assume we run in the root cgroup (when containerized, that's our view)
usage_logger = CgroupsUsageLogger(logger, "/") if args.log_usage else NoopUsageLogger()

usage_logger = CgroupsUsageLogger(logger, CpuCgroup().cgroup) if args.log_usage else NoopUsageLogger()

try:
init_pid_file(args.pid_file)
Expand Down
17 changes: 14 additions & 3 deletions gprofiler/usage_loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,17 @@

import psutil

CGROUPFS_ROOT = "/sys/fs/cgroup" # TODO extract from /proc/mounts, this may change
from granulate_utils.linux.cgroups.cgroup import find_v1_hierarchies, find_v2_hierarchy


# TODO(Creatone): Move it to granulate-utils. Consider change.
def _obtain_cgroup_controller_path(cgroup: str, controller: str) -> str:
    """
    Build the filesystem path of ``cgroup`` under the hierarchy of ``controller``.

    :param cgroup: cgroup path; it is appended verbatim after the hierarchy root.
    :param controller: controller name, e.g. 'cpuacct' or 'memory'.
    :return: the combined path as a string.
    """
    v1_hierarchies = find_v1_hierarchies()

    if len(v1_hierarchies) == 1:
        # Exactly one v1 entry is treated as "not cgroup v1": the path is built from the v2 hierarchy.
        # NOTE(review): presumably cgroup v2 has no per-controller directory and exposes different
        # file names - confirm this branch produces a usable path.
        v2_root = find_v2_hierarchy()
        return f"{v2_root}/{controller}{cgroup}"

    assert controller in v1_hierarchies
    controller_root = v1_hierarchies[controller]
    return f"{controller_root}{cgroup}"
Copy link
Contributor

@Jongy Jongy Oct 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will work now - because cgroups v2 files are different. For example - there is no cpuacct.usage file, but cpu.stat file which contains a usage_usec field.

I created a ticket for cgroups v2 support. Until that's done, I suggest you raise an exception here if v2 is in use.

Also I see that granulate-utils does this check: len(get_cgroups(os.getpid())) == 1 (in _verify_preconditions). It's subtly different from what you did here (it checks the controllers available for this process instead of the mounted controllers). I think it'll produce the same results, but let's be consistent and use the same check here (you can export it to a function in granulate-utils - generally, if you need to make changes in granulate-utils, you can open a PR there as well, and in the gProfiler repo you just update the revision of the submodule to point to your PR revision)



class UsageLoggerInterface:
Expand All @@ -30,7 +40,8 @@ class CpuUsageLogger(UsageLoggerInterface):

def __init__(self, logger: logging.LoggerAdapter, cgroup: str):
self._logger = logger
self._cpuacct_usage = Path(f"{CGROUPFS_ROOT}{cgroup}cpuacct/cpuacct.usage")
cpu_root = _obtain_cgroup_controller_path(cgroup, 'cpuacct')
self._cpuacct_usage = Path(os.path.join(cpu_root, "cpuacct.usage"))
self._last_usage: Optional[int] = None
self._last_ts: Optional[float] = None

Expand Down Expand Up @@ -78,7 +89,7 @@ class MemoryUsageLogger(UsageLoggerInterface):

def __init__(self, logger: logging.LoggerAdapter, cgroup: str):
self._logger = logger
memory_root = f"{CGROUPFS_ROOT}{cgroup}memory"
memory_root = _obtain_cgroup_controller_path(cgroup, 'memory')
self._memory_usage = Path(os.path.join(memory_root, "memory.usage_in_bytes"))
self._memory_watermark = Path(os.path.join(memory_root, "memory.max_usage_in_bytes"))
self._last_usage: Optional[int] = None
Expand Down
59 changes: 59 additions & 0 deletions tests/test_cgroups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#
# Copyright (c) Granulate. All rights reserved.
# Licensed under the AGPL3 License. See LICENSE.md in the project root for license information.
#
import os
import subprocess
from pathlib import Path
from subprocess import Popen
from typing import List

import pytest
from docker import DockerClient
from docker.models.images import Image

from tests.utils import run_privileged_container, _print_process_output


def test_cgroup_limit_container(
    docker_client: DockerClient,
    gprofiler_docker_image: Image,
    output_directory: Path,
) -> None:
    """
    When gProfiler runs as a container it must not try to set cgroup resource limits,
    even if --limit-cpu / --limit-memory are given.
    """
    logs = run_privileged_container(
        docker_client,
        gprofiler_docker_image,
        command=["-v", "--limit-cpu", "0.5", "--limit-memory", "1048576", "-o", str(output_directory)],
    )

    # Match only the value-independent prefix of the log message. The previous expectation spelled out
    # memory='1024.00' MB while 1048576 bytes would be logged as '1.00' MB, so the check could never fail.
    # NOTE(review): assumes `logs` is the container output as a single string - confirm against tests.utils.
    limit_log_prefix = "Trying to set resource limits"

    assert limit_log_prefix not in logs


def test_cgroup_limit_privileged_executable(
    gprofiler_exe: Path,
    output_directory: Path,
) -> None:
    """
    When gProfiler runs as a privileged standalone executable it must log its attempt
    to set the requested cpu and memory limits.
    """
    os.mkdir(output_directory)

    command = [
        "sudo", str(gprofiler_exe), "-v",
        "--limit-cpu", "0.5",
        "--limit-memory", str(1 << 30),
        "-o", str(output_directory),
        "-d", "5",
        "--no-java", "--no-python", "--no-php", "--no-ruby", "--no-nodejs", "--no-dotnet",
    ]

    # communicate() drains stdout while waiting for exit. Calling wait() first with stdout=PIPE
    # can deadlock once the child fills the OS pipe buffer.
    with Popen(command, stdout=subprocess.PIPE) as popen:
        stdout, _ = popen.communicate()
    assert popen.returncode == 0

    limit_log = "Trying to set resource limits, cpu='0.5' cores and memory='1024.00' MB."
    assert any(limit_log in line for line in stdout.decode("utf-8").splitlines())


def test_cgroup_try_limit_no_privileged_executable() -> None:
    """Placeholder for the scenario of setting limits from a non-privileged executable."""
    # Report the missing test as skipped instead of `assert False`: the bare assert always fails
    # the suite and is stripped entirely when Python runs with -O.
    pytest.skip("Not implemented yet.")