Skip to content

Commit

Permalink
Merge branch 'joeo/PFOPS-887-pull-debug-logs-via-virsh-console' into …
Browse files Browse the repository at this point in the history
…'master'

Joeo/pfops 887/888 pull debug logs via virsh console

Changes only testnet/tools/icos_collect_debug_info.py. Adds traces in the error handler to query the ic_hosts about whether the ic_guest VMs even started for the failed test. This differentiates between a libvirt/Ansible failure and a VM startup failure (IPv6 not initializing, or a DNS failure).

See merge request dfinity-lab/public/ic!2105
  • Loading branch information
joeodfinity committed Dec 7, 2021
2 parents e8d35fb + 28748a1 commit 18f1e28
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions testnet/tools/icos_collect_debug_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
${REPO_ROOT}/gitlab-ci/src/artifacts/gitlab_artifacts_download.py --job-id <gitlab-job-id>
"""
import argparse
import json
import logging
import os
import pathlib
Expand Down Expand Up @@ -42,6 +43,44 @@ def get_deployment_nodes(deployment_name: str):
return yaml.load(output, Loader=yaml.FullLoader)


def _get_map_node_to_ic_host(deployment_name: str):
    """Return the reverse mapping {node: phy_host} from each guest node to the physical host listing it.

    The mapping is built from the JSON printed by the ansible inventory script when invoked with
    `--deployment <deployment_name> --list`.
    """
    inventory_cmd = [
        repo_root / "testnet/ansible/inventory/inventory.py",
        "--deployment",
        deployment_name,
        "--list",
    ]
    inventory = json.loads(subprocess.check_output(inventory_cmd))
    hostvars = inventory["_meta"]["hostvars"]
    # Hosts without an "ic_guests" entry contribute nothing; every listed guest points back to its host.
    return {
        guest: phy_host
        for phy_host, host_vars in hostvars.items()
        for guest in host_vars.get("ic_guests", [])
    }


def collect_host_dominfo(nodes: typing.Dict[str, str], deployment_name: str):
    """Print the libvirt dominfo of every deployment VM, as reported by the physical host behind it.

    For each node name in `nodes`, the physical host (ic_host) is looked up in the ansible inventory
    and `sudo virsh dominfo <node_name>` is run on that host over SSH. The command's stdout is printed,
    and anything it writes to stderr is logged as a warning.

    Collection is best-effort: a node without a known ic_host, or an ic_host that cannot be reached,
    is logged and skipped so that the remaining nodes are still queried.

    Args:
        nodes: mapping whose keys are the node (VM) names; the values are not used here.
        deployment_name: name of the deployment, passed on to the inventory script.
    """
    if not nodes:
        # Nothing to query, so do not even run the inventory script.
        return

    node_to_ic_host = _get_map_node_to_ic_host(deployment_name)
    ssh_user = os.environ.get("USER", "gitlab-runner")

    for node_name in nodes:
        ichost = node_to_ic_host.get(node_name)
        if ichost is None:
            logging.warning("no ic_host found in the inventory for node %s, skipping dominfo", node_name)
            continue

        command = "sudo virsh dominfo " + node_name

        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(ichost, port=22, username=ssh_user, timeout=10)

            (_stdin, _stdout, _stderr) = client.exec_command(f"timeout 10 bash -c '{command}'")
            logging.info("-------")
            logging.info("using command _ %s _ on host %s", command, ichost)
            for line in iter(_stdout.readline, ""):
                print(line, end="")
            # virsh reports failures (e.g. an unknown domain) on stderr; surface them as well.
            for line in iter(_stderr.readline, ""):
                logging.warning("stderr from %s: %s", ichost, line.rstrip())
        except (paramiko.SSHException, OSError) as exc:
            # One unreachable host must not prevent collecting info from the others.
            logging.warning("could not collect dominfo for %s from host %s: %s", node_name, ichost, exc)
        finally:
            # Always release the SSH connection, even if connect/exec failed.
            client.close()


def _ssh_run_command(node: typing.List, out_dir: pathlib.Path, out_filename: str, command: str):
"""SSH into a node, run the command, and store the result in a local file {outdir}/{out_filename}."""
client = paramiko.SSHClient()
Expand Down Expand Up @@ -147,6 +186,7 @@ def collect_all_debug_info(
out_dir.mkdir(exist_ok=True, parents=True)
paramiko.util.log_to_file(out_dir / "paramiko.log", level="WARN")

collect_host_dominfo(nodes, deployment_name)
collect_journalctl_logs(nodes, out_dir)
collect_ic_replica_service_logs(nodes, out_dir)
collect_replica_api_status(nodes, out_dir)
Expand Down

0 comments on commit 18f1e28

Please sign in to comment.