diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py index b72a2b31a2..22eac67740 100644 --- a/azurelinuxagent/common/cgroupapi.py +++ b/azurelinuxagent/common/cgroupapi.py @@ -61,7 +61,7 @@ def get_extension_cgroups(self, extension_name): def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code): raise NotImplementedError() - def cleanup_old_cgroups(self): + def cleanup_legacy_cgroups(self): raise NotImplementedError() @staticmethod @@ -100,20 +100,52 @@ def _foreach_controller(operation, message): is not mounted or if an error occurs in the operation :return: Returns a list of error messages or an empty list if no errors occurred """ - errors = [] mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT) for controller in CGROUP_CONTROLLERS: try: if controller not in mounted_controllers: - logger.warn('Cgroup controller "{0}" is not mounted. {1}'.format(controller, message)) + logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message) else: operation(controller) except Exception as e: - msg = 'Error in cgroup controller "{0}": {1}. {2}'.format(controller, ustr(e), message) - logger.warn(msg) - errors.append(msg) - return errors + logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message) + + @staticmethod + def _foreach_legacy_cgroup(operation): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. Also, + when running under systemd, the PIDs should not be explicitly moved to the cgroup filesystem. The older daemons would + incorrectly do that under certain conditions. + + This method checks for the existence of the legacy cgroups and, if the daemon's PID has been added to them, executes the + given operation on the cgroups. After this check, the method attempts to remove the legacy cgroups. + + :param operation: + The function to execute on each legacy cgroup. It must take 2 arguments: the controller and the daemon's PID + """ + legacy_cgroups = [] + for controller in ['cpu', 'memory']: + cgroup = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") + if os.path.exists(cgroup): + logger.info('Found legacy cgroup {0}', cgroup) + legacy_cgroups.append((controller, cgroup)) + + try: + for controller, cgroup in legacy_cgroups: + procs_file = os.path.join(cgroup, "cgroup.procs") + + if os.path.exists(procs_file): + procs_file_contents = fileutil.read_file(procs_file).strip() + daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip() + + if daemon_pid in procs_file_contents: + operation(controller, daemon_pid) + finally: + for _, cgroup in legacy_cgroups: + logger.info('Removing {0}', cgroup) + shutil.rmtree(cgroup, ignore_errors=True) class FileSystemCgroupsApi(CGroupsApi): @@ -143,6 +175,10 @@ def _try_mkdir(path): else: raise + @staticmethod + def _get_agent_cgroup_path(controller): + return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) + @staticmethod def _get_extension_cgroups_root_path(controller): return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME) @@ -158,8 +194,7 @@ def _get_extension_cgroup_path(self, controller, extension_name): return os.path.join(extensions_root, cgroup_name) def _create_extension_cgroup(self, controller, extension_name): - return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), - controller, extension_name) + return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name) @staticmethod def _add_process_to_cgroup(pid, cgroup_path): @@ -167,40 +202,20 @@ def _add_process_to_cgroup(pid, cgroup_path): fileutil.append_file(tasks_file, "{0}\n".format(pid)) logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path)) - def cleanup_old_cgroups(self): - # Old daemon versions (2.2.31-2.2.40) wrote their PID to the WALinuxAgent/WALinuxAgent cgroup. - # Starting from version 2.2.41, we track the agent service in walinuxagent.service. This method - # cleans up the old behavior by moving the daemon's PID to the new cgroup and deleting the old cgroup. - daemon_pid_file = get_agent_pid_file_path() - daemon_pid = fileutil.read_file(daemon_pid_file) - - def cleanup_old_controller(controller): - old_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") - new_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) - - if not os.path.isdir(old_path): - return - - contents = fileutil.read_file(os.path.join(old_path, "cgroup.procs")) - - if daemon_pid in contents: - fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid) - shutil.rmtree(old_path, ignore_errors=True) - - errors = self._foreach_controller(cleanup_old_controller, - "Failed to update the tracking of the daemon; resource usage of the agent " - "will not include the daemon process.") - - if len(errors) == 0: - msg = 'Successfully cleaned up old cgroups in WALinuxAgent/WALinuxAgent.' - else: - msg = 'Failed to clean up old cgroups in WALinuxAgent/WALinuxAgent. Errors: {0}'.format(errors) + def cleanup_legacy_cgroups(self): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This + method moves the daemon's PID from the legacy cgroups to the newer cgroups. + """ + def move_daemon_pid(controller, daemon_pid): + new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller) + logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path) + fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid) + msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path) + add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg) - add_event(AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.CGroupsCleanUp, - is_success=len(errors) == 0, - message=msg) + CGroupsApi._foreach_legacy_cgroup(move_daemon_pid) def create_agent_cgroups(self): """ @@ -211,7 +226,7 @@ def create_agent_cgroups(self): pid = int(os.getpid()) def create_cgroup(controller): - path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) + path = FileSystemCgroupsApi._get_agent_cgroup_path(controller) if not os.path.isdir(path): FileSystemCgroupsApi._try_mkdir(path) @@ -532,6 +547,15 @@ def create_cgroup(controller): # The process terminated in time and successfully return extension_cgroups, process_output - def cleanup_old_cgroups(self): - # No cleanup needed from the old daemon in the systemd case. - pass + def cleanup_legacy_cgroups(self): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If + we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance + (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + """ + def report_error(_, daemon_pid): + raise CGroupsException( + "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid)) + + CGroupsApi._foreach_legacy_cgroup(report_error) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index 02cd0fb6ca..33f26e0b73 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -79,7 +79,7 @@ def enable(self): def disable(self): self._enabled = False - def _invoke_cgroup_operation(self, operation, error_message): + def _invoke_cgroup_operation(self, operation, error_message, on_error=None): """ Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation. """ @@ -89,7 +89,12 @@ def _invoke_cgroup_operation(self, operation, error_message): try: return operation() except Exception as e: - logger.warn("{0}. Error: {1}".format(error_message, ustr(e))) + logger.warn("{0} Error: {1}".format(error_message, ustr(e))) + if on_error is not None: + try: + on_error(e) + except Exception as ex: + logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e))) def create_agent_cgroups(self, track_cgroups): """ @@ -104,14 +109,26 @@ def __impl(): return cgroups - self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked") + self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.") - def cleanup_old_cgroups(self): + def cleanup_legacy_cgroups(self): def __impl(): - self._cgroups_api.cleanup_old_cgroups() + self._cgroups_api.cleanup_legacy_cgroups() - self._invoke_cgroup_operation(__impl, "Failed to update the tracking of the daemon; resource usage " - "of the agent will not include the daemon process.") + message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.' + + def disable_cgroups(exception): + self.disable() + CGroupsTelemetry.reset() + add_event( + AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.CGroupsCleanUp, + is_success=False, + log_event=False, + message='{0} {1}'.format(message, ustr(exception))) + + self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups) def create_extension_cgroups_root(self): """ @@ -120,7 +137,7 @@ def create_extension_cgroups_root(self): def __impl(): self._cgroups_api.create_extension_cgroups_root() - self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked") + self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.") def create_extension_cgroups(self, name): """ @@ -129,7 +146,7 @@ def create_extension_cgroups(self, name): def __impl(): return self._cgroups_api.create_extension_cgroups(name) - return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked".format(name)) + return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name)) def remove_extension_cgroups(self, name): """ diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py index 3e5fb2c9ea..168b4d5d50 100644 --- a/azurelinuxagent/common/cgroupstelemetry.py +++ b/azurelinuxagent/common/cgroupstelemetry.py @@ -132,7 +132,7 @@ def prune_all_tracked(): CGroupsTelemetry.stop_tracking(cgroup) @staticmethod - def cleanup(): + def reset(): with CGroupsTelemetry._rlock: CGroupsTelemetry._tracked *= 0 # emptying the list CGroupsTelemetry._cgroup_metrics = {} diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py index bf34593e81..91c53fcbb9 100644 --- a/azurelinuxagent/common/exception.py +++ b/azurelinuxagent/common/exception.py @@ -55,7 +55,11 @@ class CGroupsException(AgentError): def __init__(self, msg, inner=None): super(AgentError, self).__init__(msg, inner) + # TODO: AgentError should set the message - investigate whether doing it there would break anything + self.message = msg + def __str__(self): + return self.message class ExtensionError(AgentError): """ diff --git a/azurelinuxagent/daemon/main.py b/azurelinuxagent/daemon/main.py index 0f5296a49a..a951e3948b 100644 --- a/azurelinuxagent/daemon/main.py +++ b/azurelinuxagent/daemon/main.py @@ -69,7 +69,6 @@ def run(self, child_args=None): self.initialize_environment() CGroupConfigurator.get_instance().create_agent_cgroups(track_cgroups=False) - CGroupConfigurator.get_instance().cleanup_old_cgroups() # If FIPS is enabled, set the OpenSSL environment variable # Note: diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 04a40873db..f2f3c1f6d7 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -457,7 +457,7 @@ def _ensure_readonly_files(self): def _ensure_cgroups_initialized(self): configurator = CGroupConfigurator.get_instance() configurator.create_agent_cgroups(track_cgroups=True) - configurator.cleanup_old_cgroups() + configurator.cleanup_legacy_cgroups() configurator.create_extension_cgroups_root() def _evaluate_agent_health(self, latest_agent): diff --git a/tests/common/test_cgroupapi.py b/tests/common/test_cgroupapi.py index 12575860ce..db3558a48d 100644 --- a/tests/common/test_cgroupapi.py +++ b/tests/common/test_cgroupapi.py @@ -18,16 +18,16 @@ from __future__ import print_function import subprocess - -from azurelinuxagent.common.cgroupapi import CGroupsApi, FileSystemCgroupsApi, SystemdCgroupsApi, VM_AGENT_CGROUP_NAME -from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes +from azurelinuxagent.common.cgroupapi import CGroupsApi, FileSystemCgroupsApi, SystemdCgroupsApi, CGROUPS_FILE_SYSTEM_ROOT, VM_AGENT_CGROUP_NAME +from azurelinuxagent.common.exception import CGroupsException, ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil from nose.plugins.attrib import attr +from tests.utils.cgroups_tools import CGroupsTools from tests.tools import * -class CGroupsApiTestCase(AgentTestCase): +class _MockedFileSystemTestCase(AgentTestCase): def setUp(self): AgentTestCase.setUp(self) @@ -36,12 +36,15 @@ def setUp(self): os.mkdir(os.path.join(self.cgroups_file_system_root, "cpu")) os.mkdir(os.path.join(self.cgroups_file_system_root, "memory")) - self.mock__base_cgroups = patch("azurelinuxagent.common.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT", self.cgroups_file_system_root) - self.mock__base_cgroups.start() + self.mock_cgroups_file_system_root = patch("azurelinuxagent.common.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT", self.cgroups_file_system_root) + self.mock_cgroups_file_system_root.start() def tearDown(self): - self.mock__base_cgroups.stop() + self.mock_cgroups_file_system_root.stop() + AgentTestCase.tearDown(self) + +class CGroupsApiTestCase(_MockedFileSystemTestCase): def test_create_should_return_a_SystemdCgroupsApi_on_systemd_platforms(self): with patch("azurelinuxagent.common.cgroupapi.CGroupsApi._is_systemd", return_value=True): api = CGroupsApi.create() @@ -119,59 +122,31 @@ def controller_operation(controller): self.assertEqual(len(successful_controllers), 1, 'The operation was not executed on unexpected controllers: {0}'.format(successful_controllers)) args, kwargs = mock_logger_warn.call_args - message = args[0] - self.assertIn('Error in cgroup controller "cpu": A test exception.', message) - - -class FileSystemCgroupsApiTestCase(AgentTestCase): - - def setUp(self): - AgentTestCase.setUp(self) - - self.cgroups_file_system_root = os.path.join(self.tmp_dir, "cgroup") - os.mkdir(self.cgroups_file_system_root) - os.mkdir(os.path.join(self.cgroups_file_system_root, "cpu")) - os.mkdir(os.path.join(self.cgroups_file_system_root, "memory")) + (message_format, controller, error, message) = args + self.assertEquals(message_format, 'Error in cgroup controller "{0}": {1}. {2}') + self.assertEquals(controller, 'cpu') + self.assertEquals(error, 'A test exception') + self.assertEquals(message, 'A dummy message') - self.mock__base_cgroups = patch("azurelinuxagent.common.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT", self.cgroups_file_system_root) - self.mock__base_cgroups.start() - def tearDown(self): - self.mock__base_cgroups.stop() - - AgentTestCase.tearDown(self) - - @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_cleanup_old_cgroups_should_move_daemon_pid_on_all_controllers(self, _): - # Set up the mock /var/run/waagent.pid file +class FileSystemCgroupsApiTestCase(_MockedFileSystemTestCase): + def test_cleanup_legacy_cgroups_should_move_daemon_pid_to_new_cgroup_and_remove_legacy_cgroups(self): + # Set up a mock /var/run/waagent.pid file daemon_pid = "42" - daemon_pid_file_tmp = os.path.join(self.tmp_dir, "waagent.pid") - with open(daemon_pid_file_tmp, "w") as f: - f.write(daemon_pid) + daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid") + fileutil.write_file(daemon_pid_file, daemon_pid + "\n") # Set up old controller cgroups and add the daemon PID to them - old_cpu_cgroup = os.path.join(self.cgroups_file_system_root, "cpu", "WALinuxAgent", "WALinuxAgent") - old_memory_cgroup = os.path.join(self.cgroups_file_system_root, "memory", "WALinuxAgent", "WALinuxAgent") - - os.makedirs(old_cpu_cgroup) - os.makedirs(old_memory_cgroup) - - fileutil.write_file(os.path.join(old_cpu_cgroup, "cgroup.procs"), daemon_pid + "\n") - fileutil.write_file(os.path.join(old_memory_cgroup, "cgroup.procs"), daemon_pid + "\n") + legacy_cpu_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid) + legacy_memory_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid) - # Set up new controller cgroups and add another PID to them - new_cpu_cgroup = os.path.join(self.cgroups_file_system_root, "cpu", VM_AGENT_CGROUP_NAME) - new_memory_cgroup = os.path.join(self.cgroups_file_system_root, "memory", VM_AGENT_CGROUP_NAME) - - os.makedirs(new_cpu_cgroup) - os.makedirs(new_memory_cgroup) - - fileutil.write_file(os.path.join(new_cpu_cgroup, "cgroup.procs"), "999\n") - fileutil.write_file(os.path.join(new_memory_cgroup, "cgroup.procs"), "999\n") + # Set up new controller cgroups and add extension handler's PID to them + new_cpu_cgroup = CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "cpu", "999") + new_memory_cgroup = CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "memory", "999") with patch("azurelinuxagent.common.cgroupapi.add_event") as mock_add_event: - with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file_tmp): - FileSystemCgroupsApi().cleanup_old_cgroups() + with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file): + FileSystemCgroupsApi().cleanup_legacy_cgroups() # The method should have added the daemon PID to the new controllers and deleted the old ones new_cpu_contents = fileutil.read_file(os.path.join(new_cpu_cgroup, "cgroup.procs")) @@ -180,44 +155,19 @@ def test_cleanup_old_cgroups_should_move_daemon_pid_on_all_controllers(self, _): self.assertTrue(daemon_pid in new_cpu_contents) self.assertTrue(daemon_pid in new_memory_contents) - self.assertFalse(os.path.exists(old_cpu_cgroup)) - self.assertFalse(os.path.exists(old_memory_cgroup)) + self.assertFalse(os.path.exists(legacy_cpu_cgroup)) + self.assertFalse(os.path.exists(legacy_memory_cgroup)) # Assert the event parameters that were sent out - _, kwargs = mock_add_event.call_args_list[0] - self.assertEquals(kwargs['op'], 'CGroupsCleanUp') - self.assertEquals(kwargs['is_success'], True) - self.assertEquals(kwargs['message'], 'Successfully cleaned up old cgroups in WALinuxAgent/WALinuxAgent.') - - def test_cleanup_old_cgroups_should_report_errors_from_all_controllers_that_failed(self): - # Set up the mock /var/run/waagent.pid file - daemon_pid = "42" - daemon_pid_file_tmp = os.path.join(self.tmp_dir, "waagent.pid") - with open(daemon_pid_file_tmp, "w") as f: - f.write(daemon_pid) - - # Set up old controller cgroups and add the daemon PID to them, but don't set up new controllers in order - # to force errors on cleanup - old_cpu_cgroup = os.path.join(self.cgroups_file_system_root, "cpu", "WALinuxAgent", "WALinuxAgent") - old_memory_cgroup = os.path.join(self.cgroups_file_system_root, "memory", "WALinuxAgent", "WALinuxAgent") - - os.makedirs(old_cpu_cgroup) - os.makedirs(old_memory_cgroup) - - fileutil.write_file(os.path.join(old_cpu_cgroup, "cgroup.procs"), daemon_pid + "\n") - fileutil.write_file(os.path.join(old_memory_cgroup, "cgroup.procs"), daemon_pid + "\n") - - with patch("azurelinuxagent.common.cgroupapi.add_event") as mock_add_event: - with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file_tmp): - FileSystemCgroupsApi().cleanup_old_cgroups() - - # Assert there were errors for both controllers - _, kwargs = mock_add_event.call_args_list[0] - self.assertEquals(kwargs['op'], 'CGroupsCleanUp') - self.assertEquals(kwargs['is_success'], False) - self.assertIn("Failed to clean up old cgroups in WALinuxAgent/WALinuxAgent.", kwargs['message']) - self.assertIn("Error in cgroup controller \"cpu\": [Errno 2] No such file or directory", kwargs['message']) - self.assertIn("Error in cgroup controller \"memory\": [Errno 2] No such file or directory", kwargs['message']) + self.assertEquals(len(mock_add_event.call_args_list), 2) + self.assertTrue(all(kwargs['op'] == 'CGroupsCleanUp' for _, kwargs in mock_add_event.call_args_list)) + self.assertTrue(all(kwargs['is_success'] for _, kwargs in mock_add_event.call_args_list)) + self.assertTrue(any( + re.match(r"Moved daemon's PID from legacy cgroup to /.*/cgroup/cpu/walinuxagent.service", kwargs['message']) + for _, kwargs in mock_add_event.call_args_list)) + self.assertTrue(any( + re.match(r"Moved daemon's PID from legacy cgroup to /.*/cgroup/memory/walinuxagent.service", kwargs['message']) + for _, kwargs in mock_add_event.call_args_list)) def test_create_agent_cgroups_should_create_cgroups_on_all_controllers(self): agent_cgroups = FileSystemCgroupsApi().create_agent_cgroups() @@ -332,28 +282,17 @@ def test_start_extension_command_should_add_the_child_process_to_the_extension_c @skip_if_predicate_false(is_systemd_present, "Systemd cgroups API doesn't manage cgroups on systems not using systemd.") class SystemdCgroupsApiTestCase(AgentTestCase): - def setUp(self): - AgentTestCase.setUp(self) - - self.cgroups_file_system_root = os.path.join(self.tmp_dir, "cgroup") - os.mkdir(self.cgroups_file_system_root) - os.mkdir(os.path.join(self.cgroups_file_system_root, "cpu")) - os.mkdir(os.path.join(self.cgroups_file_system_root, "memory")) - - def tearDown(self): - AgentTestCase.tearDown(self) - - def test_it_should_return_extensions_slice_root_name(self): + def test_get_extensions_slice_root_name_should_return_the_root_slice_for_extensions(self): root_slice_name = SystemdCgroupsApi()._get_extensions_slice_root_name() self.assertEqual(root_slice_name, "system-walinuxagent.extensions.slice") - def test_it_should_return_extension_slice_name(self): + def test_get_extension_slice_name_should_return_the_slice_for_the_given_extension(self): extension_name = "Microsoft.Azure.DummyExtension-1.0" extension_slice_name = SystemdCgroupsApi()._get_extension_slice_name(extension_name) self.assertEqual(extension_slice_name, "system-walinuxagent.extensions-Microsoft.Azure.DummyExtension_1.0.slice") @attr('requires_sudo') - def test_it_should_create_extensions_root_slice(self): + def test_create_extension_cgroups_root_should_create_extensions_root_slice(self): self.assertTrue(i_am_root(), "Test does not run when non-root") SystemdCgroupsApi().create_extension_cgroups_root() @@ -369,7 +308,7 @@ def test_it_should_create_extensions_root_slice(self): shellutil.run_get_output("systemctl daemon-reload") @attr('requires_sudo') - def test_it_should_create_extension_slice(self): + def test_create_extension_cgroups_should_create_extension_slice(self): self.assertTrue(i_am_root(), "Test does not run when non-root") extension_name = "Microsoft.Azure.DummyExtension-1.0" @@ -636,9 +575,6 @@ def mock_popen(*args, **kwargs): @patch("azurelinuxagent.common.utils.fileutil.read_file") def test_create_agent_cgroups_should_create_cgroups_on_all_controllers(self, patch_read_file): - mock__base_cgroups = patch("azurelinuxagent.common.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT", - self.cgroups_file_system_root) - mock__base_cgroups.start() mock_proc_self_cgroup = '''12:blkio:/system.slice/walinuxagent.service 11:memory:/system.slice/walinuxagent.service 10:perf_event:/ @@ -657,11 +593,50 @@ def test_create_agent_cgroups_should_create_cgroups_on_all_controllers(self, pat agent_cgroups = SystemdCgroupsApi().create_agent_cgroups() def assert_cgroup_created(controller): - expected_cgroup_path = os.path.join(self.cgroups_file_system_root, controller, "system.slice", VM_AGENT_CGROUP_NAME) + expected_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "system.slice", VM_AGENT_CGROUP_NAME) self.assertTrue(any(cgroups.path == expected_cgroup_path for cgroups in agent_cgroups)) self.assertTrue(any(cgroups.name == VM_AGENT_CGROUP_NAME for cgroups in agent_cgroups)) assert_cgroup_created("cpu") assert_cgroup_created("memory") - mock__base_cgroups.stop() + + +class SystemdCgroupsApiMockedFileSystemTestCase(_MockedFileSystemTestCase): + def test_cleanup_legacy_cgroups_should_remove_legacy_cgroups(self): + # Set up a mock /var/run/waagent.pid file + daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid") + fileutil.write_file(daemon_pid_file, "42\n") + + # Set up old controller cgroups, but do not add the daemon's PID to them + legacy_cpu_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", '') + legacy_memory_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", '') + + with patch("azurelinuxagent.common.cgroupapi.add_event") as mock_add_event: + with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file): + SystemdCgroupsApi().cleanup_legacy_cgroups() + + self.assertFalse(os.path.exists(legacy_cpu_cgroup)) + self.assertFalse(os.path.exists(legacy_memory_cgroup)) + + def test_cleanup_legacy_cgroups_should_report_an_error_when_the_daemon_pid_was_added_to_the_legacy_cgroups(self): + # Set up a mock /var/run/waagent.pid file + daemon_pid = "42" + daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid") + fileutil.write_file(daemon_pid_file, daemon_pid + "\n") + + # Set up old controller cgroups and add the daemon's PID to them + legacy_cpu_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid) + legacy_memory_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid) + + with patch("azurelinuxagent.common.cgroupapi.add_event") as mock_add_event: + with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file): + with self.assertRaises(CGroupsException) as context_manager: + SystemdCgroupsApi().cleanup_legacy_cgroups() + + self.assertEquals(context_manager.exception.message, "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid)) + + # The method should have deleted the legacy cgroups + self.assertFalse(os.path.exists(legacy_cpu_cgroup)) + self.assertFalse(os.path.exists(legacy_memory_cgroup)) + diff --git a/tests/common/test_cgroupconfigurator.py b/tests/common/test_cgroupconfigurator.py index 9315300ff4..e57fd959ab 100644 --- a/tests/common/test_cgroupconfigurator.py +++ b/tests/common/test_cgroupconfigurator.py @@ -19,10 +19,14 @@ import subprocess +import errno +from azurelinuxagent.common.cgroup import CGroup +from azurelinuxagent.common.cgroupapi import VM_AGENT_CGROUP_NAME from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.common.osutil.default import DefaultOSUtil +from tests.utils.cgroups_tools import CGroupsTools from tests.tools import * @@ -106,7 +110,7 @@ def test_cgroup_operations_should_not_invoke_the_cgroup_api_when_cgroups_are_not # List of operations to test, and the functions to mock used in order to do verifications operations = [ [lambda: configurator.create_agent_cgroups(track_cgroups=False), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_agent_cgroups"], - [lambda: configurator.cleanup_old_cgroups(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.cleanup_old_cgroups"], + [lambda: configurator.cleanup_legacy_cgroups(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.cleanup_legacy_cgroups"], [lambda: configurator.create_extension_cgroups_root(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups_root"], [lambda: configurator.create_extension_cgroups("A.B.C-1.0.0"), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups"], [lambda: configurator.remove_extension_cgroups("A.B.C-1.0.0"), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.remove_extension_cgroups"] @@ -121,28 +125,30 @@ def test_cgroup_operations_should_not_invoke_the_cgroup_api_when_cgroups_are_not def test_cgroup_operations_should_log_a_warning_when_the_cgroup_api_raises_an_exception(self): configurator = CGroupConfigurator.get_instance() - # List of operations to test, and the functions to mock in order to raise exceptions - operations = [ - [lambda: configurator.create_agent_cgroups(track_cgroups=False), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_agent_cgroups"], - [lambda: configurator.cleanup_old_cgroups(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.cleanup_old_cgroups"], - [lambda: configurator.create_extension_cgroups_root(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups_root"], - [lambda: configurator.create_extension_cgroups("A.B.C-1.0.0"), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups"], - [lambda: configurator.remove_extension_cgroups("A.B.C-1.0.0"), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.remove_extension_cgroups"] - ] + # cleanup_legacy_cgroups disables cgroups on error, so make disable() a no-op + with patch.object(configurator, "disable"): + # List of operations to test, and the functions to mock in order to raise exceptions + operations = [ + [lambda: configurator.create_agent_cgroups(track_cgroups=False), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_agent_cgroups"], + [lambda: configurator.cleanup_legacy_cgroups(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.cleanup_legacy_cgroups"], + [lambda: configurator.create_extension_cgroups_root(), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups_root"], + [lambda: configurator.create_extension_cgroups("A.B.C-1.0.0"), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups"], + [lambda: configurator.remove_extension_cgroups("A.B.C-1.0.0"), "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.remove_extension_cgroups"] + ] - def raise_exception(*_): - raise Exception("A TEST EXCEPTION") + def raise_exception(*_): + raise Exception("A TEST EXCEPTION") - for op in operations: - with patch("azurelinuxagent.common.cgroupconfigurator.logger.warn") as mock_logger_warn: - with patch(op[1], raise_exception): - op[0]() + for op in operations: + with patch("azurelinuxagent.common.cgroupconfigurator.logger.warn") as mock_logger_warn: + with patch(op[1], raise_exception): + op[0]() - self.assertEquals(mock_logger_warn.call_count, 1) + self.assertEquals(mock_logger_warn.call_count, 1) - args, kwargs = mock_logger_warn.call_args - message = args[0] - self.assertIn("A TEST EXCEPTION", message) + args, kwargs = mock_logger_warn.call_args + message = args[0] + self.assertIn("A TEST EXCEPTION", message) def test_start_extension_command_should_forward_to_subprocess_popen_when_groups_are_not_enabled(self): configurator = CGroupConfigurator.get_instance() @@ -216,3 +222,74 @@ def raise_exception(*_, **__): stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.assertIn("A TEST EXCEPTION", str(context_manager.exception)) + + def test_cleanup_legacy_cgroups_should_disable_cgroups_when_it_fails_to_process_legacy_cgroups(self): + # Set up a mock /var/run/waagent.pid file + daemon_pid = "42" + daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid") + fileutil.write_file(daemon_pid_file, daemon_pid + "\n") + + # Set up old controller cgroups and add the daemon PID to them + CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid) + CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid) + + # Set up new controller cgroups and add extension handler's PID to them + CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "cpu", "999") + CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "memory", "999") + + def mock_append_file(filepath, contents, **kwargs): + if re.match(r'/.*/cpu/.*/cgroup.procs', filepath): + raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC)) + fileutil.append_file(filepath, controller, **kwargs) + + # Start tracking a couple of dummy cgroups + CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu")) + CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory")) + + cgroup_configurator = CGroupConfigurator.get_instance() + + with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as mock_add_event: + with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file): + with patch("azurelinuxagent.common.cgroupapi.fileutil.append_file", side_effect=mock_append_file): + cgroup_configurator.cleanup_legacy_cgroups() + + self.assertEquals(len(mock_add_event.call_args_list), 1) + _, kwargs = mock_add_event.call_args_list[0] + self.assertEquals(kwargs['op'], 'CGroupsCleanUp') + self.assertFalse(kwargs['is_success']) + self.assertEquals(kwargs['message'], 'Failed to process legacy cgroups. Collection of resource usage data will be disabled. [Errno 28] No space left on device') + + self.assertFalse(cgroup_configurator.enabled()) + self.assertEquals(len(CGroupsTelemetry._tracked), 0) + + @patch("azurelinuxagent.common.cgroupapi.CGroupsApi._is_systemd", return_value=True) + def test_cleanup_legacy_cgroups_should_disable_cgroups_when_the_daemon_was_added_to_the_legacy_cgroup_on_systemd(self, _): + # Set up a mock /var/run/waagent.pid file + daemon_pid = "42" + daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid") + fileutil.write_file(daemon_pid_file, daemon_pid + "\n") + + # Set up old controller cgroups and add the daemon PID to them + CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid) + CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid) + + # Start tracking a couple of dummy cgroups + CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu")) + CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory")) + + cgroup_configurator = CGroupConfigurator.get_instance() + + with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as mock_add_event: + with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file): + cgroup_configurator.cleanup_legacy_cgroups() + + self.assertEquals(len(mock_add_event.call_args_list), 1) + _, kwargs = mock_add_event.call_args_list[0] + self.assertEquals(kwargs['op'], 'CGroupsCleanUp') + self.assertFalse(kwargs['is_success']) + self.assertEquals( + kwargs['message'], + "Failed to process legacy cgroups. Collection of resource usage data will be disabled. The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid)) + + self.assertFalse(cgroup_configurator.enabled()) + self.assertEquals(len(CGroupsTelemetry._tracked), 0) diff --git a/tests/common/test_cgroupstelemetry.py b/tests/common/test_cgroupstelemetry.py index 34abfb0880..0556cd1b31 100644 --- a/tests/common/test_cgroupstelemetry.py +++ b/tests/common/test_cgroupstelemetry.py @@ -74,11 +74,11 @@ def make_new_cgroup(name="test-cgroup"): class TestCGroupsTelemetry(AgentTestCase): def setUp(self): AgentTestCase.setUp(self) - CGroupsTelemetry.cleanup() + CGroupsTelemetry.reset() def tearDown(self): AgentTestCase.tearDown(self) - CGroupsTelemetry.cleanup() + CGroupsTelemetry.reset() def _assert_cgroup_metrics_equal(self, cpu_usage, memory_usage, max_memory_usage): for _, cgroup_metric in CGroupsTelemetry._cgroup_metrics.items(): diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index d330c98e8e..caac7930b6 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -619,7 +619,7 @@ class TestExtensionMetricsDataTelemetry(AgentTestCase): def setUp(self): AgentTestCase.setUp(self) - CGroupsTelemetry.cleanup() + CGroupsTelemetry.reset() @patch('azurelinuxagent.common.event.EventLogger.add_event') @patch("azurelinuxagent.common.cgroupstelemetry.CGroupsTelemetry.poll_all_tracked") diff --git a/tests/utils/cgroups_tools.py b/tests/utils/cgroups_tools.py new file mode 100644 index 0000000000..19aeaa9edd --- /dev/null +++ b/tests/utils/cgroups_tools.py @@ -0,0 +1,50 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ +# + +import os +from azurelinuxagent.common.cgroupapi import VM_AGENT_CGROUP_NAME +from azurelinuxagent.common.utils import fileutil + +class CGroupsTools(object): + @staticmethod + def create_legacy_agent_cgroup(cgroups_file_system_root, controller, daemon_pid): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. + + This method creates a mock cgroup using the legacy path and adds the given PID to it. + """ + legacy_cgroup = os.path.join(cgroups_file_system_root, controller, "WALinuxAgent", "WALinuxAgent") + if not os.path.exists(legacy_cgroup): + os.makedirs(legacy_cgroup) + fileutil.append_file(os.path.join(legacy_cgroup, "cgroup.procs"), daemon_pid + "\n") + return legacy_cgroup + + @staticmethod + def create_agent_cgroup(cgroups_file_system_root, controller, extension_handler_pid): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. + + This method creates a mock cgroup using the newer path and adds the given PID to it. + """ + new_cgroup = os.path.join(cgroups_file_system_root, controller, VM_AGENT_CGROUP_NAME) + if not os.path.exists(new_cgroup): + os.makedirs(new_cgroup) + fileutil.append_file(os.path.join(new_cgroup, "cgroup.procs"), extension_handler_pid + "\n") + return new_cgroup +