Skip to content

Commit

Permalink
Merge branch 'develop' into cgroupsv2-daemon
Browse files Browse the repository at this point in the history
  • Loading branch information
nagworld9 authored Sep 20, 2024
2 parents 97524a1 + 4dcf95c commit 058699c
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 90 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/ci_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,14 @@ jobs:
# * 'contextmanager-generator-missing-cleanup' are false positives if yield is used inside an if-else block for contextmanager generator functions.
# (https://pylint.readthedocs.io/en/latest/user_guide/messages/warning/contextmanager-generator-missing-cleanup.html).
# This check is not implemented on versions 3.0-3.7, where pylint reports: Bad option value 'contextmanager-generator-missing-cleanup' (bad-option-value)
# * 3.9-3.11 will produce "too-many-positional-arguments" for several methods that have more than 5 args, so we suppress that warning.
# (R0917: Too many positional arguments (8/5) (too-many-positional-arguments))
PYLINT_OPTIONS="--rcfile=ci/pylintrc --jobs=0"
if [[ "${{ matrix.python-version }}" == "3.9" ]]; then
PYLINT_OPTIONS="$PYLINT_OPTIONS --disable=no-member --ignore=main.py"
PYLINT_OPTIONS="$PYLINT_OPTIONS --disable=no-member,too-many-positional-arguments --ignore=main.py"
fi
if [[ "${{ matrix.python-version }}" =~ ^3\.(10|11)$ ]]; then
PYLINT_OPTIONS="$PYLINT_OPTIONS --disable=too-many-positional-arguments"
fi
if [[ "${{ matrix.python-version }}" =~ ^3\.[0-7]$ ]]; then
PYLINT_OPTIONS="$PYLINT_OPTIONS --disable=no-self-use,bad-option-value"
Expand Down
23 changes: 1 addition & 22 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"ResourceDisk.MountPoint": "/mnt/resource",
"ResourceDisk.MountOptions": None,
"ResourceDisk.Filesystem": "ext3",
"AutoUpdate.GAFamily": "Prod",
"Debug.CgroupMonitorExpiryTime": "2022-03-31",
"Debug.CgroupMonitorExtensionName": "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent",
"AutoUpdate.GAFamily": "Prod"
}


Expand Down Expand Up @@ -616,25 +614,6 @@ def get_enable_agent_memory_usage_check(conf=__conf__):
"""
return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", False)


def get_cgroup_monitor_expiry_time(conf=__conf__):
"""
cgroups monitoring for pilot extensions disabled after expiry time
NOTE: This option is experimental and may be removed in later versions of the Agent.
"""
return conf.get("Debug.CgroupMonitorExpiryTime", "2022-03-31")


def get_cgroup_monitor_extension_name (conf=__conf__):
"""
cgroups monitoring extension name
NOTE: This option is experimental and may be removed in later versions of the Agent.
"""
return conf.get("Debug.CgroupMonitorExtensionName", "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent")


def get_enable_fast_track(conf=__conf__):
"""
If True, the agent use FastTrack when retrieving goal states
Expand Down
44 changes: 4 additions & 40 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1325,7 +1325,7 @@ def set_extension_resource_limits(self):
extension_name = self.get_full_name()
# set up the resource limits for extension operations and its services.
man = self.load_manifest()
resource_limits = man.get_resource_limits(extension_name, self.ext_handler.version)
resource_limits = man.get_resource_limits()
if not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(extension_name,
cpu_quota=resource_limits.get_extension_slice_cpu_quota()):
CGroupConfigurator.get_instance().setup_extension_slice(
Expand Down Expand Up @@ -1395,7 +1395,7 @@ def _enable_extension(self, extension, uninstall_exit_code):
self.__set_extension_state(extension, ExtensionState.Enabled)

# start tracking the extension services cgroup.
resource_limits = man.get_resource_limits(self.get_full_name(), self.ext_handler.version)
resource_limits = man.get_resource_limits()
CGroupConfigurator.get_instance().start_tracking_extension_services_cgroups(
resource_limits.get_service_list())

Expand Down Expand Up @@ -1462,7 +1462,7 @@ def uninstall(self, extension=None):
man = self.load_manifest()

# stop tracking extension services cgroup.
resource_limits = man.get_resource_limits(self.get_full_name(), self.ext_handler.version)
resource_limits = man.get_resource_limits()
CGroupConfigurator.get_instance().stop_tracking_extension_services_cgroups(
resource_limits.get_service_list())
CGroupConfigurator.get_instance().remove_extension_services_drop_in_files(
Expand Down Expand Up @@ -2132,14 +2132,6 @@ def get_env_file(self):
def get_log_dir(self):
return os.path.join(conf.get_ext_log_dir(), self.ext_handler.name)

@staticmethod
def is_azuremonitorlinuxagent(extension_name):
cgroup_monitor_extension_name = conf.get_cgroup_monitor_extension_name()
if re.match(r"\A" + cgroup_monitor_extension_name, extension_name) is not None\
and datetime.datetime.utcnow() < datetime.datetime.strptime(conf.get_cgroup_monitor_expiry_time(), "%Y-%m-%d"):
return True
return False

@staticmethod
def _read_status_file(ext_status_file):
err_count = 0
Expand Down Expand Up @@ -2258,35 +2250,7 @@ def supports_multiple_extensions(self):
value = self.data['handlerManifest'].get('supportsMultipleExtensions', False)
return self._parse_boolean_value(value, default_val=False)

def get_resource_limits(self, extension_name, str_version):
"""
Placeholder values for testing and monitoring the monitor extension resource usage.
This is not effective after nov 30th.
"""
if ExtHandlerInstance.is_azuremonitorlinuxagent(extension_name):
if FlexibleVersion(str_version) < FlexibleVersion("1.12"):
test_man = {
"resourceLimits": {
"services": [
{
"name": "mdsd.service"
}
]
}
}
return ResourceLimits(test_man.get('resourceLimits', None))
else:
test_man = {
"resourceLimits": {
"services": [
{
"name": "azuremonitoragent.service"
}
]
}
}
return ResourceLimits(test_man.get('resourceLimits', None))

def get_resource_limits(self):
return ResourceLimits(self.data.get('resourceLimits', None))

def report_invalid_boolean_properties(self, ext_name):
Expand Down
2 changes: 0 additions & 2 deletions tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
Debug.CgroupDisableOnProcessCheckFailure = True
Debug.CgroupDisableOnQuotaCheckFailure = True
Debug.CgroupLogMetrics = False
Debug.CgroupMonitorExpiryTime = 2022-03-31
Debug.CgroupMonitorExtensionName = Microsoft.Azure.Monitor.AzureMonitorLinuxAgent
Debug.EnableAgentMemoryUsageCheck = False
Debug.EnableCgroupV2ResourceLimiting = False
Debug.EnableExtensionPolicy = False
Expand Down
14 changes: 0 additions & 14 deletions tests_e2e/tests/ext_cgroups/install_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from datetime import datetime, timedelta
from pathlib import Path

from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
Expand All @@ -35,7 +34,6 @@ def __init__(self, context: AgentVmTestContext):
self._ssh_client = self._context.create_ssh_client()

def run(self):
self._prepare_agent()
# Install the GATest extension to test service cgroups
self._install_gatest_extension()
# Install the Azure Monitor Agent to test long running process cgroup
Expand All @@ -45,18 +43,6 @@ def run(self):
# Install the CSE extension to test extension cgroup
self._install_cse()

def _prepare_agent(self):
log.info("=====Executing update-waagent-conf remote script to update monitoring deadline flag for tracking azuremonitoragent service")
future_date = datetime.utcnow() + timedelta(days=2)
expiry_time = future_date.date().strftime("%Y-%m-%d")
# The agent needs the extension info and its services info in the handlermanifest.xml to monitor and limit the resource usage.
# As part of pilot testing, the agent hardcoded the azuremonitoragent service name to monitor it for some time in production without needing a manifest update from the extension side,
# so that they could get a sense of resource usage for their extensions. We did this for a few months and are no longer monitoring it in production.
# In this test, however, we change the config flag expiry time to a future date so that the test agent starts tracking the cgroups used by the service.
result = self._ssh_client.run_command(f"update-waagent-conf Debug.CgroupMonitorExpiryTime={expiry_time}", use_sudo=True)
log.info(result)
log.info("Updated agent cgroups config(CgroupMonitorExpiryTime)")

def _install_ama(self):
ama_extension = VirtualMachineExtensionClient(
self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent,
Expand Down
6 changes: 2 additions & 4 deletions tests_e2e/tests/lib/cgroup_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,9 @@
CGROUP_TRACKED_PATTERN = re.compile(r'Started tracking cgroup ([^\s]+)\s+\[(?P<path>[^\s]+)\]')

GATESTEXT_FULL_NAME = "Microsoft.Azure.Extensions.Edp.GATestExtGo"
GATESTEXT_SERVICE = "gatestext.service"
GATESTEXT_SERVICE = "gatestext"
AZUREMONITOREXT_FULL_NAME = "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent"
AZUREMONITORAGENT_SERVICE = "azuremonitoragent.service"
MDSD_SERVICE = "mdsd.service"

AZUREMONITORAGENT_SERVICE = "azuremonitoragent"

def verify_if_distro_supports_cgroup():
"""
Expand Down
10 changes: 3 additions & 7 deletions tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from tests_e2e.tests.lib.agent_log import AgentLog
from tests_e2e.tests.lib.cgroup_helpers import verify_if_distro_supports_cgroup, \
verify_agent_cgroup_assigned_correctly, BASE_CGROUP, EXT_CONTROLLERS, get_unit_cgroup_mount_path, \
GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE, MDSD_SERVICE, check_agent_quota_disabled, \
GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE, check_agent_quota_disabled, \
check_cgroup_disabled_with_unknown_process, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \
print_cgroups
from tests_e2e.tests.lib.logging import log
Expand Down Expand Up @@ -118,10 +118,6 @@ def verify_extension_service_cgroup_created_on_file_system():
# Azure Monitor Extension Service
azuremonitoragent_cgroup_mount_path = get_unit_cgroup_mount_path(AZUREMONITORAGENT_SERVICE)
azuremonitoragent_service_name = AZUREMONITORAGENT_SERVICE
# Old versions of the AMA extension have a different service name
if azuremonitoragent_cgroup_mount_path is None:
azuremonitoragent_cgroup_mount_path = get_unit_cgroup_mount_path(MDSD_SERVICE)
azuremonitoragent_service_name = MDSD_SERVICE
verify_extension_service_cgroup_created(azuremonitoragent_service_name, azuremonitoragent_cgroup_mount_path)

log.info('Verified all extension service cgroup paths created in file system .\n')
Expand Down Expand Up @@ -177,7 +173,7 @@ def verify_ext_cgroups_tracked():
azuremonitoragent_cgroups_tracked = True
elif name.startswith(GATESTEXT_SERVICE):
gatestext_service_cgroups_tracked = True
elif name.startswith(AZUREMONITORAGENT_SERVICE) or name.startswith(MDSD_SERVICE):
elif name.startswith(AZUREMONITORAGENT_SERVICE):
azuremonitoragent_service_cgroups_tracked = True
cgroups_added_for_telemetry.append((name, path))

Expand Down Expand Up @@ -218,7 +214,7 @@ def main():
main()
except Exception as e:
# The agent cgroup may be disabled due to an UNKNOWN process, or throttled, before we run this check; in that case, we should ignore the validation
if check_cgroup_disabled_with_unknown_process() and retry_if_false(check_agent_quota_disabled()):
if check_cgroup_disabled_with_unknown_process() and retry_if_false(check_agent_quota_disabled):
log.info("Cgroup is disabled due to UNKNOWN process, ignoring ext cgroups validations")
else:
raise

0 comments on commit 058699c

Please sign in to comment.