Skip to content

Commit

Permalink
Improvements (#70)
Browse files Browse the repository at this point in the history
* TOOLS-1392: (ASADM-HEALTHCHECK) Fix physical memory check to handle different units.

* TOOLS-1396: (ASADM-HEALTHCHECK) Fix rack membership check.

* TOOLS-1397: (ASADM-HEALTHCHECK) Add rule to check if we don't have too many sprigs per partition for all flash.

* TOOLS-1400: (ASADM) Fix unique data usage computation to support post 4.2 changes for tombstone size.

* TOOLS-1401: (ASADM-HEALTHCHECK) Add rule to report excessive heartbeat interval/timeout.
  • Loading branch information
hbpatre authored Oct 4, 2019
1 parent 268337f commit 9afa3fc
Show file tree
Hide file tree
Showing 24 changed files with 1,912 additions and 320 deletions.
13 changes: 12 additions & 1 deletion lib/client/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,11 +612,14 @@ def _info_peers_helper(self, peers):
peers_list = util.parse_peers_string(gen_port_peers[2])
if not peers_list or len(peers_list) < 1:
return []

p_list = []

for p in peers_list:
p_data = util.parse_peers_string(p)
if not p_data or len(p_data) < 3:
continue

# TODO - not used node_name = p_data[0]
tls_name = None
if p_data[1] and len(p_data[1]) > 0:
Expand All @@ -628,15 +631,19 @@ def _info_peers_helper(self, peers):

if not tls_name:
tls_name = util.find_dns(endpoints)

endpoint_list = []

for e in endpoints:
if "[" in e and "]:" not in e:
addr_port = util.parse_peers_string(e, delim=",")
else:
addr_port = util.parse_peers_string(e, delim=":")

addr = addr_port[0]
if addr.startswith("["):
addr = addr[1:]

if addr.endswith("]"):
addr = addr[:-1].strip()

Expand All @@ -645,12 +652,16 @@ def _info_peers_helper(self, peers):
port = addr_port[1]
else:
port = default_port

try:
port = int(port)
except Exception:
port = default_port

endpoint_list.append((addr, port, tls_name))

p_list.append(tuple(endpoint_list))

return p_list

@return_exceptions
Expand Down Expand Up @@ -723,7 +734,7 @@ def _info_service_helper(self, service, delimiter=";"):
if not service or isinstance(service, Exception):
return []
s = map(lambda v: util.parse_peers_string(v, ":"), util.info_to_list(service, delimiter=delimiter))
return map(lambda v: (v[0].strip("[]"), int(self.port), self.tls_name), s)
return map(lambda v: (v[0].strip("[]"), int(v[1]) if len(v)>1 and v[1] else int(self.port), self.tls_name), s)

# post 3.10 services

Expand Down
56 changes: 41 additions & 15 deletions lib/client/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,41 +142,50 @@ def find_dns(endpoints):
return None


def parse_peers_string(s, delim=",", ignore_chars_start="[", ignore_chars_end="]"):
o = []
if not s or isinstance(s, Exception):
return o
s = s.strip()
if not s:
return o
if s[0] in ignore_chars_start and s[-1] in ignore_chars_end:
s = s[1:-1]
if not s:
return o
def parse_peers_string(peers_str, delim=",", ignore_chars_start="[", ignore_chars_end="]"):
peers_list = []
if not peers_str or isinstance(peers_str, Exception):
return peers_list

peers_str = peers_str.strip()
if not peers_str:
return peers_list

if peers_str[0] in ignore_chars_start and peers_str[-1] in ignore_chars_end:
peers_str = peers_str[1:-1]

if not peers_str:
return peers_list

push_bracket = ignore_chars_start
pop_bracket = ignore_chars_end
b_stack = []
current_str = ""
for i in s:
for i in peers_str:
if i == delim:
if len(b_stack) > 0:
current_str += i
else:
o.append(current_str.strip())
peers_list.append(current_str.strip())
current_str = ""
continue

if i in push_bracket:
current_str += i
b_stack.append(i)
continue

if i in pop_bracket:
current_str += i
b_stack.pop()
continue

current_str += i

if current_str:
o.append(current_str.strip())
return o
peers_list.append(current_str.strip())

return peers_list


def concurrent_map(func, data):
Expand Down Expand Up @@ -234,6 +243,12 @@ def __call__(self, *args):


def flatten(list1):
"""
Simple function to flatten peers list
Format: [((node1 endpoint1 tuple), (node1 endpoint2 tuple), ..., (node1 endpointm tuple)), ....]
Example: [(("172.17.0.1",3000,None),), (("2001:db8:85a3::8a2e",6666,None), ("172.17.0.3",3004,None))]
"""

f_list = []
for i in list1:
if isinstance(i[0], tuple):
Expand All @@ -245,6 +260,10 @@ def flatten(list1):


def remove_suffix(input_string, suffix):
"""
Simple function to remove suffix from input_string if available
"""

try:
input_string = input_string.strip()
if not input_string.endswith(suffix):
Expand All @@ -255,8 +274,15 @@ def remove_suffix(input_string, suffix):


def get_value_from_dict(d, keys, default_value=None, return_type=None):
"""
Simple function to fetch and return value from dict d for first available key from keys
If no key available then it returns default_value
If return_type is provided then it converts and returns the fetched value
"""

if not isinstance(keys, tuple):
keys = (keys,)

for key in keys:
if key in d:
val = d[key]
Expand Down
33 changes: 27 additions & 6 deletions lib/collectinfo_parser/sys_section_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,10 @@ def _parse_free_m_section(imap, parsed_map):

mem_obj = {}
for idx, val in enumerate(tok_list):
mem_obj[val] = data_list[idx + 1]
try:
mem_obj[val] = int(data_list[idx + 1])
except Exception:
mem_obj[val] = data_list[idx + 1]

free_m_data['mem'] = mem_obj
continue
Expand All @@ -595,8 +598,15 @@ def _parse_free_m_section(imap, parsed_map):
data_list = line.rstrip().split()

buffer_obj = {}
buffer_obj[tok_list[1]] = data_list[2]
buffer_obj[tok_list[2]] = data_list[3]
try:
buffer_obj[tok_list[1]] = int(data_list[2])
except Exception:
buffer_obj[tok_list[1]] = data_list[2]

try:
buffer_obj[tok_list[2]] = int(data_list[3])
except Exception:
buffer_obj[tok_list[2]] = data_list[3]

free_m_data['buffers/cache'] = buffer_obj
continue
Expand All @@ -605,9 +615,20 @@ def _parse_free_m_section(imap, parsed_map):
data_list = line.rstrip().split()

swap_obj = {}
swap_obj[tok_list[0]] = data_list[1]
swap_obj[tok_list[1]] = data_list[2]
swap_obj[tok_list[2]] = data_list[3]
try:
swap_obj[tok_list[0]] = int(data_list[1])
except Exception:
swap_obj[tok_list[0]] = data_list[1]

try:
swap_obj[tok_list[1]] = int(data_list[2])
except Exception:
swap_obj[tok_list[1]] = data_list[2]

try:
swap_obj[tok_list[2]] = int(data_list[3])
except Exception:
swap_obj[tok_list[2]] = data_list[3]

free_m_data['swap'] = swap_obj
continue
Expand Down
4 changes: 3 additions & 1 deletion lib/health/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from lib.health.exceptions import HealthException
from lib.health.operation import select_keys_from_dict, AggOperation, ApplyOperation,\
AssertDetailOperation, BinaryOperation, ComplexOperation, SimpleOperation
from lib.health.util import create_health_internal_tuple, create_snapshot_key
from lib.health.util import create_health_internal_tuple, create_snapshot_key, get_value_from_health_internal_tuple

SNAPSHOT_KEY_PREFIX = "SNAPSHOT"
SNAPSHOT_KEY_PATTERN = r"SNAPSHOT(\d+)$"
Expand Down Expand Up @@ -137,6 +137,8 @@ def is_data_true(data):
return False

if not isinstance(data, dict):
if not get_value_from_health_internal_tuple(data):
return False
return True

for _k in data:
Expand Down
82 changes: 75 additions & 7 deletions lib/health/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,22 @@
"heartbeat.mtu check.",
multicast_mode_enabled);
interval = select "heartbeat.interval" from NETWORK.CONFIG save;
r1 = do interval < 150;
r2 = do interval > 250;
r = do r1 || r2;
ASSERT(r, False, "Heartbeat interval is not in expected range (150 <= p <= 250)", "OPERATIONS", INFO,
"Listed node(s) have heartbeat interval value not in expected range (150 <= p <= 250). New node might fail to join cluster.",
"Heartbeat interval Check (150 <= p <= 250)");
timeout = select "heartbeat.timeout" from NETWORK.CONFIG save;
r1 = do timeout < 10;
r2 = do timeout > 15;
r = do r1 || r2;
ASSERT(r, False, "Heartbeat timeout is not in expected range (10 <= p <= 15)", "OPERATIONS", INFO,
"Listed node(s) have heartbeat timeout value not in expected range (10 <= p <= 15). New node might fail to join cluster.",
"Heartbeat timeout Check (10 <= p <= 15)");
s = select "migrate-threads", "migrate_threads" from SERVICE.CONFIG save;
r = do s > 1;
Expand Down Expand Up @@ -379,7 +395,7 @@
r = do warning_check || correct_range_check;
ASSERT(r, True, "Number of Sets equal to or above 750", "LIMITS", INFO,
"Listed namespace(s) have high number of set count (>=750). Please run in AQL 'show sets' for details",
"Basic Set Count Check (750 =< p < 1000)");
"Basic Set Count Check (750 <= p < 1000)");
stop_writes = select "stop_writes" from NAMESPACE.STATISTICS;
stop_writes = group by CLUSTER, NAMESPACE stop_writes;
Expand Down Expand Up @@ -1694,6 +1710,58 @@
e);
SET CONSTRAINT VERSION >= 4.3.0.2;
// sprig mounts-size-limit checks
// critical case
cs = select "cluster_size" as "sprig_limit_critical" from SERVICE.STATISTICS;
cs = group by CLUSTER do MAX(cs) save as "cluster-size";
repl = select "replication-factor" as "sprig_limit_critical" from NAMESPACE.STATISTICS;
pts = select "partition-tree-sprigs" as "sprig_limit_critical" from NAMESPACE.CONFIG;
msl = select "index-type.mounts-size-limit" as "sprig_limit_critical" from NAMESPACE.CONFIG;
// the statement below adds a thousands delimiter to mounts-size-limit when it is printed
msl = do msl * 1 save as "mounts-size-limit";
// check for enterprise edition
e = select "edition" from METADATA;
e = do e == "Enterprise";
e = group by CLUSTER, NODE do OR(e);
// calculate sprig overhead
r = do 4096 * repl;
r = do r/cs;
r = do r * pts;
r = do r * 4096 save as "Minimum space required";
r = do r > msl;
ASSERT(r, False, "ALL FLASH / PMEM - Too many sprigs per partition for current available index mounted space. Some records are likely failing to be created.", "OPERATIONS", CRITICAL,
"Minimum space required for sprig overhead at current cluster size exceeds mounts-size-limit.
See: https://www.aerospike.com/docs/operations/configure/namespace/index/#flash-index and https://www.aerospike.com/docs/operations/plan/capacity/#aerospike-all-flash",
"Check for too many sprigs for current cluster size.",
e);
// warning case
mcs = select "min-cluster-size" as "sprig_limit_warning" from SERVICE;
mcs = group by CLUSTER do MAX(mcs) save as "min-cluster-size";
repl = select "replication-factor" as "sprig_limit_warning" from NAMESPACE.STATISTICS;
pts = select "partition-tree-sprigs" as "sprig_limit_warning" from NAMESPACE.CONFIG;
msl = select "index-type.mounts-size-limit" as "sprig_limit_warning" from NAMESPACE.CONFIG;
// the statement below adds a thousands delimiter to mounts-size-limit when it is printed
msl = do msl * 1 save as "mounts-size-limit";
// calculate sprig overhead
r = do 4096 * repl;
r = do r/mcs;
r = do r * pts;
r = do r * 4096 save as "Minimum space required";
r = do r > msl;
ASSERT(r, False, "ALL FLASH / PMEM - Too many sprigs per partition for configured min-cluster-size.", "OPERATIONS", WARNING,
"Minimum space required for sprig overhead at min-cluster-size exceeds mounts-size-limit.
See: https://www.aerospike.com/docs/operations/configure/namespace/index/#flash-index and https://www.aerospike.com/docs/operations/plan/capacity/#aerospike-all-flash",
"Check for too many sprigs for minimum cluster size.",
e);
SET CONSTRAINT VERSION ALL;
SET CONSTRAINT VERSION >= 4.0.0.1;
// SC mode rules
Expand Down Expand Up @@ -1730,12 +1798,6 @@
"Cluster clock_skew check",
s);
roster = select "roster", "observed_nodes" from ROSTER.CONFIG;
r = group by CLUSTER, NAMESPACE, NODE do EQUAL(roster);
ASSERT(r, True, "Roster misconfigured.", "OPERATIONS", WARNING,
"Listed namespace[s] shows difference between set roster nodes and observe nodes. Please set roster properly.",
"Roster misconfiguration check.");
size = select "cluster_size" from SERVICE.STATISTICS;
p = group by CLUSTER do MAX(size) save as "cluster_size";
repl = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG save as "replication_factor";
Expand All @@ -1747,6 +1809,12 @@
sc_check = select "strong-consistency" from NAMESPACE.CONFIG;
sc_check = group by CLUSTER, NAMESPACE do OR(sc_check);
roster = select "roster", "observed_nodes" from ROSTER.CONFIG;
r = group by CLUSTER, NAMESPACE, NODE do EQUAL(roster);
ASSERT(r, True, "Roster misconfigured.", "OPERATIONS", WARNING,
"Listed namespace[s] shows difference between set roster nodes and observed nodes. Please set roster properly.",
"Roster misconfiguration check.", sc_check);
roster_null_check = select "roster" from ROSTER.CONFIG;
roster_null_check = group by CLUSTER, NAMESPACE, NODE roster_null_check;
roster_null_check = do "null" IN roster_null_check;
Expand Down
Loading

0 comments on commit 9afa3fc

Please sign in to comment.