diff --git a/README.md b/README.md index 1c7019e..b2adee8 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Please prefer installation via system packages like `python3-requests`. Alternatively you can install with pip: - pip3 install requests + pip3 install -r requirements.txt Make sure to modify the shebang to your environment, one of the following should be fine. @@ -43,12 +43,23 @@ optional arguments: --password PASSWORD, -p PASSWORD Password for Basic Auth --mode MODE, -m MODE Check mode + --exclude [EXCLUDE ...] + Exclude alarms or usage from the check results. Can be used multiple times and supports regular expressions. --max-age MAX_AGE, -M MAX_AGE Max age in minutes for capacity usage updates. Defaults to 5 --version, -V Print version --insecure Do not verify TLS certificate. Be careful with this option, please ``` +The `--exclude` parameter will match against alarms and capacity-usage. It uses the following string representation (whitespaces included) to match against: + +* alarms: `severity` `node_display_name` `feature_display_name` `event_type_display_name` +* capacity-usage: `severity` `display_name` + +## Examples + +Mode: cluster-status + ``` $ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode cluster-status [OK] control_cluster_status=STABLE - mgmt_cluster_status=STABLE - control_cluster_status=STABLE - nodes_online=3 @@ -66,14 +77,25 @@ $ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password | nodes_online=3;;;0 ``` +Mode: alarms + ``` $ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode alarms [WARNING] 1 alarms - 1 medium [MEDIUM] (2021-04-26 17:25:18) (node1) Intelligence Health/Storage Latency High - Intelligence node storage latency is high. +| alarms=1;;;0 alarms.medium=1;;;0 +``` + +``` +$ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode alarms --exclude "LOW" +# Excluded alerts will still be counted, but are not factored into the exit code +[OK] 1 alarms | alarms=1;;;0 ``` +Mode: capacity-usage + ``` $ ./check_vmware_nsxt.py --api 'https://vmware-nsx.local' -u icinga -p password --mode capacity-usage [OK] 28 info - no usages - last update: 2021-04-29 19:06:12 diff --git a/check_vmware_nsxt.py b/check_vmware_nsxt.py index 9707668..7904044 100644 --- a/check_vmware_nsxt.py +++ b/check_vmware_nsxt.py @@ -37,6 +37,7 @@ import logging import datetime import ssl +import re from urllib.parse import urljoin import urllib3 import requests @@ -127,26 +128,26 @@ def request(self, url, method='GET'): except Exception as json_exc: raise CriticalException('Could not decode API JSON: ' + str(json_exc)) # pylint: disable=raise-missing-from - def get_cluster_status(self): + def get_cluster_status(self, excludes=None): """ GET and build ClusterStatus """ - return ClusterStatus(self.request('cluster/status')) + return ClusterStatus(self.request('cluster/status'), excludes) - def get_alarms(self): + def get_alarms(self, excludes=None): """ GET and build Alarms """ status = "OPEN" # status = "RESOLVED" # for testing result = self.request('alarms?page_size=100&status=%s&sort_ascending=false' % status) - return Alarms(result['results']) + return Alarms(data=result['results'], excludes=excludes) - def get_capacity_usage(self): + def get_capacity_usage(self, excludes=None): """ GET and build CapacityUsage """ - return CapacityUsage(self.request('capacity/usage'), self.max_age) + return CapacityUsage(self.request('capacity/usage'), self.max_age, excludes) class CheckResult: @@ -203,9 +204,12 @@ class ClusterStatus(CheckResult): https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_ReadClusterStatus.html """ - def __init__(self, data): + def __init__(self, data, excludes): super().__init__() self.data = data + self.excludes = excludes + if excludes is None: + self.excludes = [] def build_output(self): for area in ['control_cluster_status', 'mgmt_cluster_status', 'control_cluster_status']: @@ -234,14 +238,33 @@ class Alarms(CheckResult): https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_GetAlarms.html """ - def __init__(self, data): + def __init__(self, data, excludes): super().__init__() self.data = data + self.excludes = excludes + if excludes is None: + self.excludes = [] + + def _is_excluded(self, alarm): + # to exclude via --exclude + identifier = "%s %s %s %s" % ( + alarm['severity'], + alarm['node_display_name'], + alarm['feature_display_name'], + alarm['event_type_display_name']) + for exclude in self.excludes: + regexp = re.compile(exclude) + if bool(regexp.search(identifier)): + return True + return False def build_output(self): states = {} for alarm in self.data: + if self._is_excluded(alarm): + continue + severity = alarm['severity'] if severity in states: states[severity] += 1 @@ -270,7 +293,11 @@ def build_status(self): states = [] for alarm in self.data: - state = WARNING if alarm['severity'] in ['MEDIUM', 'LOW'] else CRITICAL # CRITICAL, HIGH + if self._is_excluded(alarm): + continue + + # HIGH == CRITICAL + state = WARNING if alarm['severity'] in ['MEDIUM', 'LOW'] else CRITICAL states.append(state) if len(states) > 0: @@ -285,15 +312,33 @@ class CapacityUsage(CheckResult): https://vdc-download.vmware.com/vmwb-repository/dcr-public/787988e9-6348-4b2a-8617-e6d672c690ee/a187360c-77d5-4c0c-92a8-8e07aa161a27/api_includes/method_GetProtonCapacityUsage.html """ - def __init__(self, data, max_age): + def __init__(self, data, max_age, excludes): super().__init__() self.data = data self.max_age = max_age + self.excludes = excludes + if excludes is None: + self.excludes = [] + + def _is_excluded(self, usage): + # to exclude via --exclude + identifier = "%s %s" % ( + usage['severity'], + usage['display_name']) + + for exclude in self.excludes: + regexp = re.compile(exclude) + if bool(regexp.search(identifier)): + return True + return False def build_output(self): states = {} for usage in self.data['capacity_usage']: + if self._is_excluded(usage): + continue + severity = usage['severity'] # INFO, WARNING, CRITICAL, ERROR if severity in states: @@ -341,6 +386,9 @@ def build_status(self): self.summary.append("last update older than %s minutes" % (self.max_age)) for usage in self.data['capacity_usage']: + if self._is_excluded(usage): + continue + severity = usage['severity'] # INFO, WARNING, CRITICAL, ERROR if severity == "INFO": @@ -398,6 +446,8 @@ def commandline(args): help='Password for Basic Auth', required=True) parser.add_argument('--mode', '-m', choices=['cluster-status', 'alarms', 'capacity-usage'], help='Check mode to exectue. Hint: alarms will only include open alarms.', required=True) + parser.add_argument('--exclude', nargs='*', action='extend', type=str, + help="Exclude alarms or usage from the check results. Can be used multiple times and supports regular expressions.") parser.add_argument('--max-age', '-M', type=int, help='Max age in minutes for capacity usage updates. Defaults to 5', default=5, required=False) parser.add_argument('--insecure', @@ -421,11 +471,11 @@ def main(args): client = Client(args.api, args.username, args.password, verify=(not args.insecure), max_age=args.max_age) if args.mode == 'cluster-status': - return client.get_cluster_status().print_and_return() + return client.get_cluster_status(args.exclude).print_and_return() if args.mode == 'alarms': - return client.get_alarms().print_and_return() + return client.get_alarms(args.exclude).print_and_return() if args.mode == 'capacity-usage': - return client.get_capacity_usage().print_and_return() + return client.get_capacity_usage(args.exclude).print_and_return() print("[UNKNOWN] unknown mode %s" % args.mode) return UNKNOWN diff --git a/test_check_vmware_nsxt.py b/test_check_vmware_nsxt.py index 56cada1..410a674 100644 --- a/test_check_vmware_nsxt.py +++ b/test_check_vmware_nsxt.py @@ -153,6 +153,26 @@ def test_alarms_ok(self, mock_req, mock_print): self.assertEqual(actual, expected) mock_print.assert_called_with('[WARNING] 1 alarms - 1 medium\n\n[MEDIUM] (2021-04-26 15:25:18) (node1) Intelligence Health/Storage Latency High - Intelligence node storage latency is high.\n| alarms=1;;;0 alarms.medium=1;;;0') + @mock.patch('builtins.print') + @mock.patch('requests.request') + def test_alarms_exclude(self, mock_req, mock_print): + + with open('testdata/fixtures/alarms.json') as f: + testdata = json.load(f) + + m = mock.MagicMock() + m.status_code = 200 + m.json.return_value = testdata + mock_req.return_value = m + + c = Client('api', 'username', 'password', logger=None, verify=True, max_age=5) + + actual = c.get_alarms(excludes=["M[A-Z]+M"]).print_and_return() + expected = 0 + + self.assertEqual(actual, expected) + mock_print.assert_called_with('[OK] 1 alarms\n| alarms=1;;;0') + @mock.patch('builtins.print') @mock.patch('requests.request') def test_capacity_usage_ok(self, mock_req, mock_print): @@ -172,3 +192,20 @@ def test_capacity_usage_ok(self, mock_req, mock_print): self.assertEqual(actual, expected) mock_print.assert_called_with('[WARNING] 28 info - last update: 2021-04-30 09:17:40 - last update older than 5 minutes\n\n[OK] [INFO] System-wide NAT rules: 0 of 25000 (0%)\n[OK] [INFO] Network Introspection Rules: 1 of 10000 (0.01%)\n[OK] [INFO] System-wide Endpoint Protection Enabled Hosts: 0 of 256 (0%)\n[OK] [INFO] Hypervisor Hosts: 18 of 1024 (1.75%)\n[OK] [INFO] System-wide Firewall Rules: 81 of 100000 (0.08%)\n[OK] [INFO] System-wide DHCP Pools: 0 of 10000 (0%)\n[OK] [INFO] System-wide Edge Nodes: 10 of 320 (3.12%)\n[OK] [INFO] Active Directory Domains (Identity Firewall): 0 of 4 (0%)\n[OK] [INFO] vSphere Clusters Prepared for NSX: 4 of 128 (3.12%)\n[OK] [INFO] Prefix-lists: 20 of 500 (4%)\n[OK] [INFO] Logical Switches: 12 of 10000 (0.12%)\n[OK] [INFO] System-wide Logical Switch Ports: 145 of 25000 (0.58%)\n[OK] [INFO] Active Directory Groups (Identity Firewall): 0 of 100000 (0%)\n[OK] [INFO] Distributed Firewall Rules: 75 of 100000 (0.07%)\n[OK] [INFO] System-wide Endpoint Protection Enabled Virtual Machines: 0 of 7500 (0%)\n[OK] [INFO] Distributed Firewall Sections: 23 of 10000 (0.23%)\n[OK] [INFO] Groups Based on IP Sets: 37 of 10000 (0.37%)\n[OK] [INFO] Edge Clusters: 3 of 160 (1.87%)\n[OK] [INFO] Tier-1 Logical Routers with NAT Enabled: 0 of 4000 (0%)\n[OK] [INFO] System-wide Firewall Sections: 29 of 10000 (0.29%)\n[OK] [INFO] Network Introspection Sections: 1 of 500 (0.2%)\n[OK] [INFO] Groups: 74 of 20000 (0.37%)\n[OK] [INFO] Tier-1 Logical Routers: 4 of 4000 (0.1%)\n[OK] [INFO] IP Sets: 37 of 10000 (0.37%)\n[OK] [INFO] Network Introspection Service Chains: 0 of 24 (0%)\n[OK] [INFO] Network Introspection Service Paths: 0 of 4000 (0%)\n[OK] [INFO] Tier-0 Logical Routers: 2 of 160 (1.25%)\n[OK] [INFO] DHCP Server Instances: 0 of 10000 (0%)\n| number_of_nat_rules=0%;70;100;0;100 number_of_si_rules=0.01%;70;100;0;100 number_of_gi_protected_hosts=0%;70;100;0;100 number_of_prepared_hosts=1.75%;70;100;0;100 number_of_firewall_rules=0.08%;70;100;0;100 number_of_dhcp_ip_pools=0%;70;100;0;100 number_of_edge_nodes=3.12%;70;100;0;100 number_of_active_directory_domains=0%;70;100;0;100 number_of_vcenter_clusters=3.12%;70;100;0;100 number_of_prefix_list=4%;70;100;0;100 number_of_logical_switches=0.12%;70;100;0;100 number_of_logical_ports=0.58%;70;100;0;100 number_of_active_directory_groups=0%;70;100;0;100 number_of_dfw_rules=0.07%;70;100;0;100 number_of_gi_protected_vms=0%;70;100;0;100 number_of_dfw_sections=0.23%;70;100;0;100 number_of_groups_based_on_ip_sets=0.37%;70;100;0;100 number_of_edge_clusters=1.87%;70;100;0;100 number_of_tier1_with_nat_rule=0%;70;100;0;100 number_of_firewall_sections=0.29%;70;100;0;100 number_of_si_sections=0.2%;70;100;0;100 number_of_nsgroup=0.37%;70;100;0;100 number_of_tier1_routers=0.1%;70;100;0;100 number_of_ipsets=0.37%;70;100;0;100 number_of_si_service_chains=0%;70;100;0;100 number_of_si_service_paths=0%;70;100;0;100 number_of_tier0_routers=1.25%;70;100;0;100 number_of_dhcp_servers=0%;70;100;0;100') + + @mock.patch('builtins.print') + @mock.patch('requests.request') + def test_capacity_usage_exclude(self, mock_req, mock_print): + + with open('testdata/fixtures/capacity-usage.json') as f: + testdata = json.load(f) + + m = mock.MagicMock() + m.status_code = 200 + m.json.return_value = testdata + mock_req.return_value = m + + c = Client('api', 'username', 'password', logger=None, verify=True, max_age=5) + + actual = c.get_capacity_usage(".*").print_and_return() + expected = 0